6. Análisis descriptivo
6.1 Datos de análisis y tablas
# ANALYSIS: MEMBER CUSTOMERS
miembros <- supermarket %>% filter(Customer.type == "Member")
summary(miembros)
## City Customer.type Gender Product.line
## Length:501 Length:501 Length:501 Length:501
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Unit.price Quantity Tax.5. Total
## Min. :10.08 Min. : 1.000 Min. : 0.5085 Min. : 10.68
## 1st Qu.:32.25 1st Qu.: 3.000 1st Qu.: 5.6310 1st Qu.: 118.25
## Median :56.04 Median : 5.000 Median :12.6680 Median : 266.03
## Mean :56.21 Mean : 5.559 Mean :15.6091 Mean : 327.79
## 3rd Qu.:79.93 3rd Qu.: 8.000 3rd Qu.:23.1225 3rd Qu.: 485.57
## Max. :99.96 Max. :10.000 Max. :49.6500 Max. :1042.65
## datetime Payment cogs
## Min. :2019-01-01 11:36:00 Length:501 Min. : 10.17
## 1st Qu.:2019-01-24 17:37:00 Class :character 1st Qu.:112.62
## Median :2019-02-12 17:49:00 Mode :character Median :253.36
## Mean :2019-02-14 05:45:02 Mean :312.18
## 3rd Qu.:2019-03-06 15:31:00 3rd Qu.:462.45
## Max. :2019-03-30 20:37:00 Max. :993.00
## Rating tmed day month
## Min. : 4.00 Min. :19.80 Min. :1.000 Min. :1.000
## 1st Qu.: 5.40 1st Qu.:25.00 1st Qu.:3.000 1st Qu.:1.000
## Median : 7.00 Median :27.00 Median :4.000 Median :2.000
## Mean : 6.94 Mean :26.67 Mean :4.064 Mean :1.988
## 3rd Qu.: 8.50 3rd Qu.:28.30 3rd Qu.:6.000 3rd Qu.:3.000
## Max. :10.00 Max. :34.00 Max. :7.000 Max. :3.000
## week hour daynum
## Min. : 1.000 Min. :10.00 Min. : 1.00
## 1st Qu.: 4.000 1st Qu.:12.00 1st Qu.: 7.00
## Median : 7.000 Median :15.00 Median :15.00
## Mean : 6.944 Mean :14.97 Mean :14.96
## 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.:23.00
## Max. :13.000 Max. :20.00 Max. :31.00
# Member medians from the summary above: unit price 56.04, quantity 5,
# rating 7, total 266
IngresosMiembros <- sum(miembros$Total)
IngresosMiembros # total revenue from members
## [1] 164223.4
CantidadesMiembros <- sum(miembros$Quantity)
CantidadesMiembros # total units bought by members
## [1] 2785
ModaMiembroGenero <- mlv(miembros$Gender, method = "mfv")
ModaMiembroGenero # modal gender: female
## [1] "Female"
# Gender frequency table among members (261 female vs 240 male, per table below)
kable(table(miembros$Gender),caption = "Frecuencia Genero(Miembro)")
Frecuencia Genero(Miembro)
|
Var1
|
Freq
|
|
Female
|
261
|
|
Male
|
240
|
#261 mujeres
# Most frequent product line among members (mode via modeest::mlv)
ModaMiembroProductos <- mlv(miembros$Product.line, method = "mfv")
ModaMiembroProductos # "Food&Beverages" (94 purchases)
## [1] "Food&Beverages"
# Product-line frequency table among members
kable(table(miembros$Product.line),caption = "Frecuencia Productos(Miembro)")
Frecuencia Productos(Miembro)
|
Var1
|
Freq
|
|
Electronic
|
78
|
|
Fashion_accessories
|
86
|
|
Food&Beverages
|
94
|
|
Health&Beauty
|
73
|
|
Home&Lifestyle
|
83
|
|
Sports&Travel
|
87
|
# Payment-method frequency table among members
kable(table(miembros$Payment),caption = "Frecuencia Pago(Miembro)")
Frecuencia Pago(Miembro)
|
Var1
|
Freq
|
|
Cash
|
168
|
|
Credit card
|
172
|
|
Ewallet
|
161
|
#172 Credit card
# Weekday frequency table among members (day 3 peaks at 90 purchases)
kable(table(miembros$day),caption = "Frecuencia dias(Miembro)")
Frecuencia dias(Miembro)
|
Var1
|
Freq
|
|
1
|
66
|
|
2
|
59
|
|
3
|
90
|
|
4
|
76
|
|
5
|
64
|
|
6
|
64
|
|
7
|
82
|
#90 martes
# Purchases per hour of day among members (lubridate::hour on the timestamp)
kable(table(hour(miembros$datetime)),caption = "Frecuencia Horas(Miembro)")
Frecuencia Horas(Miembro)
|
Var1
|
Freq
|
|
10
|
42
|
|
11
|
46
|
|
12
|
46
|
|
13
|
49
|
|
14
|
48
|
|
15
|
56
|
|
16
|
37
|
|
17
|
36
|
|
18
|
45
|
|
19
|
61
|
|
20
|
35
|
# 19:00 61compras
# Most frequent member rating; "mfv" returns every mode on a tie
ModaMiembroRating <- mlv(miembros$Rating, method = "mfv")
ModaMiembroRating # bimodal: 6.6 and 9.5 (14 occurrences each)
## [1] 6.6 9.5
# Full frequency table of member ratings
kable(table(miembros$Rating),caption = "Frecuencia Rating(Miembro)")
Frecuencia Rating(Miembro)
|
Var1
|
Freq
|
|
4
|
7
|
|
4.1
|
9
|
|
4.2
|
11
|
|
4.3
|
11
|
|
4.4
|
7
|
|
4.5
|
10
|
|
4.6
|
6
|
|
4.7
|
8
|
|
4.8
|
6
|
|
4.9
|
10
|
|
5
|
13
|
|
5.1
|
8
|
|
5.2
|
7
|
|
5.3
|
6
|
|
5.4
|
8
|
|
5.5
|
8
|
|
5.6
|
11
|
|
5.7
|
6
|
|
5.8
|
7
|
|
5.9
|
9
|
|
6
|
12
|
|
6.1
|
7
|
|
6.2
|
6
|
|
6.3
|
5
|
|
6.4
|
6
|
|
6.5
|
10
|
|
6.6
|
14
|
|
6.7
|
9
|
|
6.8
|
6
|
|
6.9
|
7
|
|
7
|
13
|
|
7.1
|
7
|
|
7.2
|
6
|
|
7.3
|
12
|
|
7.4
|
5
|
|
7.5
|
8
|
|
7.6
|
9
|
|
7.7
|
8
|
|
7.8
|
8
|
|
7.9
|
7
|
|
8
|
11
|
|
8.1
|
8
|
|
8.2
|
5
|
|
8.3
|
5
|
|
8.4
|
11
|
|
8.5
|
12
|
|
8.6
|
9
|
|
8.7
|
10
|
|
8.8
|
8
|
|
8.9
|
9
|
|
9
|
6
|
|
9.1
|
8
|
|
9.2
|
6
|
|
9.3
|
5
|
|
9.4
|
6
|
|
9.5
|
14
|
|
9.6
|
7
|
|
9.7
|
8
|
|
9.8
|
11
|
|
9.9
|
7
|
|
10
|
2
|
# ANALYSIS: NORMAL (NON-MEMBER) CUSTOMERS
normal <- supermarket %>% filter(Customer.type == "Normal")
summary(normal)
## City Customer.type Gender Product.line
## Length:499 Length:499 Length:499 Length:499
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Unit.price Quantity Tax.5. Total
## Min. :10.56 Min. : 1.000 Min. : 0.6045 Min. : 12.69
## 1st Qu.:33.23 1st Qu.: 3.000 1st Qu.: 6.1540 1st Qu.: 129.23
## Median :54.28 Median : 5.000 Median :11.3060 Median : 237.43
## Mean :55.14 Mean : 5.461 Mean :15.1487 Mean : 318.12
## 3rd Qu.:76.46 3rd Qu.: 8.000 3rd Qu.:22.0290 3rd Qu.: 462.61
## Max. :99.96 Max. :10.000 Max. :49.4900 Max. :1039.29
## datetime Payment cogs
## Min. :2019-01-01 10:39:00 Length:499 Min. : 12.09
## 1st Qu.:2019-01-24 19:23:00 Class :character 1st Qu.:123.08
## Median :2019-02-15 12:44:00 Mode :character Median :226.12
## Mean :2019-02-15 01:18:12 Mean :302.97
## 3rd Qu.:2019-03-08 20:19:30 3rd Qu.:440.58
## Max. :2019-03-30 14:58:00 Max. :989.80
## Rating tmed day month
## Min. : 4.000 Min. :19.80 Min. :1.000 Min. :1.000
## 1st Qu.: 5.700 1st Qu.:25.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 7.000 Median :27.00 Median :4.000 Median :2.000
## Mean : 7.005 Mean :26.72 Mean :4.138 Mean :1.998
## 3rd Qu.: 8.400 3rd Qu.:28.40 3rd Qu.:6.000 3rd Qu.:3.000
## Max. :10.000 Max. :34.00 Max. :7.000 Max. :3.000
## week hour daynum
## Min. : 1.000 Min. :10.00 Min. : 1.00
## 1st Qu.: 4.000 1st Qu.:12.00 1st Qu.: 8.00
## Median : 7.000 Median :15.00 Median :15.00
## Mean : 7.048 Mean :14.85 Mean :15.55
## 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.:23.00
## Max. :13.000 Max. :20.00 Max. :31.00
# Normal-customer medians from the summary above: unit price 54.28,
# quantity 5, rating 7, total 237.43
IngresosNormal <- sum(normal$Total)
IngresosNormal # total revenue from normal customers
## [1] 158743.3
CantidadesNormal <- sum(normal$Quantity)
CantidadesNormal # total units bought by normal customers
## [1] 2725
ModaNormalGenero <- mlv(normal$Gender, method = "mfv")
ModaNormalGenero # modal gender: male
## [1] "Male"
# Gender frequency table among normal customers
kable(table(normal$Gender),caption = "Frecuencia Genero(Normal)") # 259 male
Frecuencia Genero(Normal)
|
Var1
|
Freq
|
|
Female
|
240
|
|
Male
|
259
|
# Most frequent product lines among normal customers (tie -> two modes)
ModaNormalProductos <- mlv(normal$Product.line, method = "mfv")
ModaNormalProductos # "Electronic" (92) and "Fashion_accessories" (92)
## [1] "Electronic" "Fashion_accessories"
# Product-line frequency table among normal customers
kable(table(normal$Product.line),caption = "Frecuencia Productos(Normal)")
Frecuencia Productos(Normal)
|
Var1
|
Freq
|
|
Electronic
|
92
|
|
Fashion_accessories
|
92
|
|
Food&Beverages
|
80
|
|
Health&Beauty
|
79
|
|
Home&Lifestyle
|
77
|
|
Sports&Travel
|
79
|
# Payment-method frequency table among normal customers
kable(table(normal$Payment),caption = "Frecuencia Pagos(Normal)") # 184 Ewallet
Frecuencia Pagos(Normal)
|
Var1
|
Freq
|
|
Cash
|
176
|
|
Credit card
|
139
|
|
Ewallet
|
184
|
# Weekday frequency table among normal customers
kable(table(normal$day),caption = "Frecuencia Dias(Normal)") # 82 on day 7
Frecuencia Dias(Normal)
|
Var1
|
Freq
|
|
1
|
67
|
|
2
|
66
|
|
3
|
68
|
|
4
|
67
|
|
5
|
74
|
|
6
|
75
|
|
7
|
82
|
# Purchases per hour of day among normal customers
kable(table(hour(normal$datetime)),caption = "Frecuencia Horas(Normal)") # 59 purchases at 10:00
Frecuencia Horas(Normal)
|
Var1
|
Freq
|
|
10
|
59
|
|
11
|
44
|
|
12
|
43
|
|
13
|
54
|
|
14
|
35
|
|
15
|
46
|
|
16
|
40
|
|
17
|
38
|
|
18
|
48
|
|
19
|
52
|
|
20
|
40
|
# Most frequent rating among normal customers
ModaNormalRating <- mlv(normal$Rating, method = "mfv")
ModaNormalRating # 6.2
## [1] 6.2
# Total revenue per weekday (1-7; labelled Monday-Sunday in the charts below)
tapply(supermarket$Total, supermarket$day, FUN=sum)
## 1 2 3 4 5 6 7
## 44457.89 37899.08 51482.25 43731.14 45349.25 43926.34 56120.81
# Revenue (V1) and units sold (V2) summed per weekday
aggregate(cbind(supermarket$Total,supermarket$Quantity), by=list(day=supermarket$day), FUN=sum)
## day V1 V2
## 1 1 44457.89 778
## 2 2 37899.08 638
## 3 3 51482.25 862
## 4 4 43731.14 800
## 5 5 45349.25 755
## 6 6 43926.34 758
## 7 7 56120.81 919
# Mean rating per weekday
aggregate(supermarket$Rating, by=list(day=supermarket$day), FUN=mean)
## day x
## 1 1 7.011278
## 2 2 7.153600
## 3 3 7.003165
## 4 4 6.805594
## 5 5 6.889855
## 6 6 7.076259
## 7 7 6.901829
# Units sold per weekday with row/column totals, totals row/column highlighted.
# Fix: "striped"/"condensed" were passed positionally, so "condensed" landed in
# latex_options; bundle both into bootstrap_options. T/F spelled out as TRUE/FALSE.
kable(addmargins(table(supermarket$Quantity, supermarket$day)),caption = "Cantidades vendidas cada día") %>%
kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE) %>%
column_spec(1, bold = TRUE) %>%
row_spec(0, bold = TRUE) %>%
row_spec(11, bold = TRUE, color = "white", background = "#D7261E")%>%
column_spec(9, bold = TRUE, color = "white", background = "#D7261E")
Cantidades vendidas cada día
|
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
Sum
|
|
1
|
11
|
19
|
16
|
16
|
14
|
18
|
18
|
112
|
|
2
|
12
|
6
|
19
|
16
|
15
|
10
|
13
|
91
|
|
3
|
10
|
13
|
14
|
11
|
13
|
14
|
15
|
90
|
|
4
|
12
|
19
|
17
|
11
|
14
|
15
|
21
|
109
|
|
5
|
16
|
15
|
15
|
17
|
12
|
15
|
12
|
102
|
|
6
|
12
|
10
|
18
|
15
|
15
|
14
|
14
|
98
|
|
7
|
16
|
16
|
12
|
11
|
14
|
12
|
21
|
102
|
|
8
|
15
|
7
|
10
|
12
|
16
|
10
|
15
|
85
|
|
9
|
9
|
11
|
19
|
13
|
10
|
15
|
15
|
92
|
|
10
|
20
|
9
|
18
|
21
|
15
|
16
|
20
|
119
|
|
Sum
|
133
|
125
|
158
|
143
|
138
|
139
|
164
|
1000
|
# Same table as above but as percentages of all 1000 transactions.
# Fix: "striped"/"condensed" were passed positionally, so "condensed" landed in
# latex_options; bundle both into bootstrap_options. T/F spelled out as TRUE/FALSE.
kable(addmargins(prop.table(table(supermarket$Quantity, supermarket$day))*100),caption = "Porcentaje de ventas al día") %>%
kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE) %>%
column_spec(1, bold = TRUE) %>%
row_spec(0, bold = TRUE) %>%
row_spec(11, bold = TRUE, color = "white", background = "#D7261E")%>%
column_spec(9, bold = TRUE, color = "white", background = "#D7261E")
Porcentaje de ventas al día
|
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
Sum
|
|
1
|
1.1
|
1.9
|
1.6
|
1.6
|
1.4
|
1.8
|
1.8
|
11.2
|
|
2
|
1.2
|
0.6
|
1.9
|
1.6
|
1.5
|
1.0
|
1.3
|
9.1
|
|
3
|
1.0
|
1.3
|
1.4
|
1.1
|
1.3
|
1.4
|
1.5
|
9.0
|
|
4
|
1.2
|
1.9
|
1.7
|
1.1
|
1.4
|
1.5
|
2.1
|
10.9
|
|
5
|
1.6
|
1.5
|
1.5
|
1.7
|
1.2
|
1.5
|
1.2
|
10.2
|
|
6
|
1.2
|
1.0
|
1.8
|
1.5
|
1.5
|
1.4
|
1.4
|
9.8
|
|
7
|
1.6
|
1.6
|
1.2
|
1.1
|
1.4
|
1.2
|
2.1
|
10.2
|
|
8
|
1.5
|
0.7
|
1.0
|
1.2
|
1.6
|
1.0
|
1.5
|
8.5
|
|
9
|
0.9
|
1.1
|
1.9
|
1.3
|
1.0
|
1.5
|
1.5
|
9.2
|
|
10
|
2.0
|
0.9
|
1.8
|
2.1
|
1.5
|
1.6
|
2.0
|
11.9
|
|
Sum
|
13.3
|
12.5
|
15.8
|
14.3
|
13.8
|
13.9
|
16.4
|
100.0
|
6.2 Gráficos
# Windows-only font registration for the graphics device
windowsFonts("Arial" = windowsFont("Arial"))
# Calendar date of each transaction (time of day dropped)
supermarket$date <- as.Date(supermarket$datetime)
# Daily totals of revenue and units sold; explicit .groups = "drop" returns an
# ungrouped result and silences the implicit-regrouping message
pordias <- supermarket %>%
group_by(date,month) %>%
summarise(Total = sum(Total), Quantity = sum(Quantity), .groups = "drop")
# TOTAL REVENUE PER DAY, with a loess trend and a dashed mean line
ggplot(data = pordias, aes(x = date, y = Total)) +
geom_line(color = "#00AFBB", size = 1) +
stat_smooth(method="loess", colour="red") +
geom_hline(yintercept = mean(pordias$Total),linetype="dashed", color="blue") +
# Label anchored at the computed mean instead of the hard-coded 3628
annotate(geom="text", x=as.Date("2019-01-01"), y=mean(pordias$Total),label="Media", size=2.5) +
xlab("Mes") +
theme_ipsum() +
ggtitle("Ingresos totales")
## `geom_smooth()` using formula 'y ~ x'

# TOTAL UNITS SOLD PER DAY, with a loess trend and a dashed mean line
ggplot(data = pordias, aes(x = date, y = Quantity)) +
geom_line(color = "#00AFBB", size = 1) +
stat_smooth(method="loess", colour="red") +
geom_hline(yintercept = mean(pordias$Quantity),linetype="dashed", color="blue") +
# Label anchored at the computed mean instead of the hard-coded 61.91
annotate(geom="text", x=as.Date("2019-01-01"), y=mean(pordias$Quantity),label="Media", size=2.5) +
xlab("Mes") +
theme_ipsum() +
ggtitle("Cantidades totales vendidas")
## `geom_smooth()` using formula 'y ~ x'

# Revenue and units sold per date, month and city; explicit .groups = "drop"
# returns an ungrouped result and silences the implicit-regrouping message
pormes <- supermarket %>%
group_by(date,month,City) %>%
summarise(Total = sum(Total), Quantity = sum(Quantity), .groups = "drop")
# TOTAL REVENUE PER MONTH, FACETED BY CITY (free axes per facet)
# Fix: spell out `scales =` instead of relying on partial matching of `scale =`
ggplot(data = pormes, aes(x = date, y = Total)) +
geom_line()+stat_smooth(method="loess", colour="red")+facet_wrap(~City,scales = "free") +
geom_hline(yintercept = mean(pormes$Total),linetype="dashed", color="blue") +
xlab("Mes") +
theme_ipsum() +
ggtitle("Ingresos totales por mes según ciudades")
## `geom_smooth()` using formula 'y ~ x'

# UNITS SOLD PER MONTH, FACETED BY CITY (free axes per facet)
# Fix: spell out `scales =` instead of relying on partial matching of `scale =`
ggplot(data = pormes, aes(x = date, y = Quantity)) +
geom_line() +
stat_smooth(method="loess", colour="red") +
facet_wrap(~City,scales = "free") +
geom_hline(yintercept = mean(pormes$Quantity),linetype="dashed", color="blue") +
xlab("Mes") +
theme_ipsum() +
ggtitle("Cantidades totales por mes según ciudades")
## `geom_smooth()` using formula 'y ~ x'

# Total revenue and units sold per weekday
pordia <- supermarket %>%
group_by(day) %>%
summarise(Total = sum(Total), Quantity = sum(Quantity))
## `summarise()` ungrouping output (override with `.groups` argument)
# Factor so ggplot treats the weekday as a discrete axis
pordia$day <- as.factor(pordia$day)
# BAR CHART: TOTAL REVENUE PER WEEKDAY (flipped, Spanish weekday labels)
ggplot(data = pordia, aes(x = day, y = Total, fill=day)) +
geom_col(color = "#00AFBB", size = 1) +
geom_hline(yintercept = mean(pordia$Total),linetype="dashed", color="blue") +
xlab("Dias semana") +
theme_ipsum() +
scale_x_discrete(breaks = c(1,2,3,4,5,6,7), labels = c("Lunes", "Martes", 'Miercoles','Jueves','Viernes','Sabado','Domingo')) +
theme(axis.text.x = element_text(angle = 45, size=9)) +
coord_flip() +
scale_fill_discrete(name = "Días", labels = c("Lunes", "Martes", 'Miercoles','Jueves','Viernes','Sabado','Domingo')) +
scale_y_continuous(breaks = c(0,10000,20000,30000,40000,50000))+
ggtitle("Ingresos totales cada dia de la semana")

# BAR CHART: TOTAL UNITS SOLD PER WEEKDAY (same layout as above)
ggplot(data = pordia, aes(x = day, y = Quantity, fill=day)) +
geom_col(color = "#00AFBB", size = 1) +
geom_hline(yintercept = mean(pordia$Quantity),linetype="dashed", color="blue") +
xlab("Dias semana") +
theme_ipsum() +
scale_x_discrete(breaks = c(1,2,3,4,5,6,7), labels = c("Lunes", "Martes", 'Miercoles','Jueves','Viernes','Sabado','Domingo')) +
theme(axis.text.x = element_text(angle = 45, size=9)) +
coord_flip() +
scale_fill_discrete(name = "Días", labels = c("Lunes", "Martes", 'Miercoles','Jueves','Viernes','Sabado','Domingo')) +
scale_y_continuous(breaks = c(0,200,400,600,800)) +
ggtitle("Cantidades totales vendidas según dia de la semana")

# Number of transactions per hour and month (plyr::ddply counts rows; V1 = count)
monthly_trend <- ddply(supermarket, .(Hour = supermarket$hour, Month = supermarket$month), nrow)
# Factor so each month gets its own line/colour
monthly_trend$Month <- as.factor(monthly_trend$Month)
# LINE CHART, one line per month, with an overall-mean reference line
ggplot(monthly_trend, aes(Hour, V1, group=Month)) +
geom_line(aes(color=Month),size =2) +
ggtitle(label = "Transacciones cada hora según mes") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5, lineheight = .8, face = "bold")) +
xlab("Hour") +
ylab("Número de transacciones") +
geom_hline(yintercept = mean(monthly_trend$V1))

# STACKED BAR CHART of the same hourly counts
ggplot(monthly_trend, aes(Hour, V1, fill=Month)) +
geom_bar(stat = "identity") +
ggtitle(label = "Transacciones cada hora según mes") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5, lineheight = .8, face = "bold"))+
xlab("Hour") +
ylab("Número de transacciones")

# Number of transactions per hour and weekday (V1 = row count per group)
daily_trend <- ddply(supermarket, .(Hour = supermarket$hour, Day = supermarket$day), nrow)
# Factor so each weekday gets its own line/colour
daily_trend$Day <- as.factor(daily_trend$Day)
# LINE CHART, one line per weekday, with an overall-mean reference line
ggplot(daily_trend, aes(Hour, V1, group=Day)) +
geom_line(aes(color=Day), size =2)+ggtitle(label = "Transacciones cada hora según día de la semana") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5, lineheight = .8, face = "bold")) +
xlab("Hour") +
ylab("Numero de transacciones") +
geom_hline(yintercept = mean(daily_trend$V1))

# STACKED BAR CHART of the same hourly counts
ggplot(daily_trend, aes(Hour, V1, fill=Day)) +
geom_bar(stat = "identity") +
ggtitle(label = "Transacciones cada hora según día de la semana") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5, lineheight = .8, face = "bold")) +
xlab("Hour") +
ylab("Numero de transacciones")

ANALISIS CLIENTES
# RATING by GENDER and PRODUCT LINE, faceted by customer type (boxplots;
# red dot = group mean from stat_summary)
ggplot(supermarket, aes(x=as.factor(Gender), y=Rating, fill = Product.line)) +
geom_boxplot(
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red")+
facet_wrap(~Customer.type) +theme_ipsum()

# QUANTITY by GENDER and PRODUCT LINE, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Quantity, fill = Product.line)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red")+
facet_wrap(~Customer.type)+theme_ipsum()

# TOTAL by GENDER and PRODUCT LINE, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Total, fill = Product.line)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red")+
facet_wrap(~Customer.type) +theme_ipsum()

##########################
# TOTAL by GENDER and PAYMENT METHOD, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Total, fill = Payment)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red")+
facet_wrap(~Customer.type) +theme_ipsum()

# QUANTITY by GENDER and PAYMENT METHOD, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Quantity, fill = Payment)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red")+
facet_wrap(~Customer.type)+theme_ipsum()

# RATING by GENDER and PAYMENT METHOD, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Rating, fill = Payment)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red") +
facet_wrap(~Customer.type)+theme_ipsum()

#############################
# UNIT PRICE by gender/payment and gender/product line, faceted by customer type
ggplot(supermarket, aes(x=as.factor(Gender), y=Unit.price, fill = Payment)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red") +
facet_wrap(~Customer.type)+theme_ipsum()

# Same plot, filled by product line instead of payment method
ggplot(supermarket, aes(x=as.factor(Gender), y=Unit.price, fill = Product.line)) +
geom_boxplot(outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=10, color="red", fill="red") +
facet_wrap(~Customer.type)+theme_ipsum()

# Plain boxplot of unit price per quantity bought
ggplot(supermarket, aes(x=as.factor(Quantity), y=Unit.price))+geom_boxplot()

#########################################################
# SAME CHARTS AS ABOVE BUT AS VIOLIN PLOTS
# PAYMENT / GENDER / RATING BY CUSTOMER TYPE
# Fix: scale_fill_viridis() was supplied twice; ggplot replaced the first call
# (warning in the output) and silently dropped alpha = 0.6. One call suffices.
ggplot(supermarket, aes(x=as.factor(Gender), y=Rating, fill = Payment)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

# PAYMENT / GENDER / QUANTITY BY CUSTOMER TYPE (violin)
# Fix: removed the duplicate scale_fill_viridis() that replaced the alpha = 0.6 scale
ggplot(supermarket, aes(x=as.factor(Gender), y=Quantity, fill = Payment)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

# PAYMENT / GENDER / TOTAL BY CUSTOMER TYPE (violin)
# Fix: removed the duplicate scale_fill_viridis() that replaced the alpha = 0.6 scale
ggplot(supermarket, aes(x=as.factor(Gender), y=Total, fill = Payment)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

# GENDER / PRODUCT LINE / TOTAL BY CUSTOMER TYPE (violin)
# Fix: removed the duplicate scale_fill_viridis() that replaced the alpha = 0.6 scale
ggplot(supermarket, aes(x=as.factor(Gender), y=Total, fill = Product.line)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

# GENDER / PRODUCT LINE / QUANTITY BY CUSTOMER TYPE (violin)
# Fix: removed the duplicate scale_fill_viridis() that replaced the alpha = 0.6 scale
ggplot(supermarket, aes(x=as.factor(Gender), y=Quantity, fill = Product.line)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

# GENDER / PRODUCT LINE / RATING BY CUSTOMER TYPE (violin)
# Fix: removed the duplicate scale_fill_viridis() that replaced the alpha = 0.6 scale
ggplot(supermarket, aes(x=as.factor(Gender), y=Rating, fill = Product.line)) +
geom_violin()+
xlab("Gender")+
geom_jitter(color="black", size=0.4, alpha=0.9) + stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red") +
facet_wrap(~Customer.type)+
coord_flip()+
scale_fill_viridis(discrete = TRUE, alpha=0.6)+
scale_color_viridis(discrete=TRUE)+
theme_ipsum() +
theme(legend.position="none")+
ggtitle("A Violin wrapping a boxplot")

ANALISIS DE RELACIONES
###################################################
# RELATIONSHIP BETWEEN QUANTITY AND TOTAL
# Key chart: shows that higher quantity means higher revenue
ggplot(supermarket, aes(x=as.factor(Quantity), y=Total)) +
geom_boxplot(fill = '#99d8c9', outlier.colour="red",# custom outliers
outlier.fill="red",
outlier.size=3,
varwidth = TRUE)+ # box width proportional to group size
xlab("Cantidad")+
geom_jitter(color="black", size=0.4, alpha=0.9) + scale_fill_viridis(discrete = TRUE, alpha=0.6)+ stat_summary(fun=mean, geom="point", shape=20, size=7, color="red", fill="red")+
ggtitle("Relación Quantity/Total")+
theme_ipsum()

# RELATIONSHIP BETWEEN UNIT PRICE AND TOTAL (scatter + smoother)
ggplot(supermarket, aes(x=Unit.price, y=Total,))+geom_jitter()+geom_smooth()+ggtitle("Relación Price/Total")+
theme_ipsum()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# NO RELATIONSHIP BETWEEN QUANTITY AND UNIT PRICE
ggplot(supermarket, aes(x=Quantity, y=Unit.price, fill=Product.line))+geom_jitter()+geom_smooth()+ggtitle("Relación Quantity/Price")+
theme_ipsum()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Total spent vs hour of day
ggplot(supermarket, aes(x=hour, y=Total, fill=Product.line))+geom_jitter()+geom_smooth()+ggtitle("Relación Total/Hora")+
theme_ipsum()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

7. Clustering
# LIGHT TRANSFORMATIONS FOR CLUSTERING
# One-hot encode the categorical variables (first level dropped)
results <- fastDummies::dummy_cols(supermarket,remove_first_dummy = TRUE)
# Drop the original (non-dummy) columns so only numeric features remain
results[,1:4] <- NULL
results$Payment <- NULL
results$datetime <- NULL
results$date <- NULL
# Standardise every column before computing distances
resultsca <- scale(results)
7.1 Hallar mejor K para métodos no jerárquicos
# Set a seed so the k-means runs below are reproducible
set.seed(20) # to reproduce the same example
# FIND THE BEST K FOR OUR CLUSTERS, USING SEVERAL METHODS:
# Elbow criterion: within-cluster sum of squares for k = 1..10
wss <- (nrow(resultsca)-1)*sum(apply(resultsca,2,var))
for (i in 2:10) wss[i] <- sum(kmeans(resultsca,centers=i)$withinss)
# 10 = maximum number of clusters examined
plot(1:10, wss, type="b", xlab="Número de Clusters",ylab="Suma de cuadrados dentro de los clusters",main="Cálculo del número óptimo de clusters con el criterio del codo")

# Elbow method (factoextra), with a reference line at k = 3
fviz_nbclust(resultsca, kmeans, method = "wss") +
geom_vline(xintercept = 3, linetype = 2)+
labs(subtitle = "Elbow method")

# Silhouette method
fviz_nbclust(resultsca, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")

# Gap statistic (25 starts, 50 bootstrap samples)
fviz_nbclust(resultsca, kmeans, nstart = 25, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")

# Calinski criterion over 1-10 groups (vegan::cascadeKM, 100 iterations)
model <- cascadeKM(resultsca, 1, 10, iter = 100)
plot(model, sortg = TRUE)

# Calinski index values per number of groups (row 2 of the results matrix)
model$results[2,]
## 1 groups 2 groups 3 groups 4 groups 5 groups 6 groups 7 groups 8 groups
## NA 140.18042 115.62061 92.01490 80.72781 74.10937 68.89688 64.83061
## 9 groups 10 groups
## 60.60041 56.96084
# GRAPHICAL METHOD WITH THE D INDEX (NbClust)
# Hierarchical ward.D2 linkage
NbClust(resultsca, distance="euclidean", min.nc=2, max.nc=10, method="ward.D2", index="dindex")

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## $All.index
## 2 3 4 5 6 7 8 9 10
## 4.5206 4.4159 4.3172 4.2293 4.1510 4.0990 4.0596 4.0219 3.9824
# Same D-index plot, but using kmeans partitions
NbClust(resultsca, distance="euclidean", min.nc=2, max.nc=10, method="kmeans", index="dindex")

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## $All.index
## 2 3 4 5 6 7 8 9 10
## 4.4667 4.3000 4.2326 4.1733 4.0877 4.0408 3.9545 3.9151 3.8840
# GRAPHICAL METHOD WITH THE HUBERT INDEX (NbClust)
# Hierarchical ward.D2 linkage
NbClust(resultsca, distance="euclidean", min.nc=2, max.nc=10, method="ward.D2", index="hubert")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## $All.index
## 2 3 4 5 6 7 8 9 10
## 3e-04 2e-04 2e-04 2e-04 3e-04 3e-04 3e-04 3e-04 3e-04
# Same Hubert-index plot, but using kmeans partitions
NbClust(resultsca, distance="euclidean", min.nc=2, max.nc=10, method="kmeans", index="hubert")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## $All.index
## 2 3 4 5 6 7 8 9 10
## 2e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04
# Recommended k according to 17 different indices, evaluated over a
# method x index grid. Fixes: character (not factor) grid columns for NbClust;
# seq_len instead of 1:nrow; take only the cluster count from Best.nc, which is
# a length-2 vector c(Number_clusters, Value_Index) and previously triggered a
# "number of items to replace" warning on assignment.
gridk <- expand.grid(
method = c('kmeans','ward.D2','average'),
index = c('sdbw','sdindex','frey','ptbiserial','ratkowsky','beale','pseudot2','duda','db','cindex','kl','ball','ch','hartigan','dunn','gap','mcclain'),
numk = 0,
stringsAsFactors = FALSE
)
for (i in seq_len(nrow(gridk))) {
res <- NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10, method = gridk$method[i], index = gridk$index[i])
gridk$numk[i] <- res$Best.nc[1]
}
table(gridk$numk)
##
## 1 2 3 5 6 7 8 9 10
## 1 22 8 3 3 2 4 1 7
#el numero de k=2 es el más recomendado
7.2 Método K-Means
# K-MEANS MODEL with the recommended k = 2
set.seed(20)
# Named arguments make the call explicit: the original positional call
# kmeans(resultsca, 2, 25) sets iter.max = 25 (the third positional
# argument of stats::kmeans), NOT nstart. Naming them preserves the
# fitted model exactly.
# NOTE(review): if 25 random starts were intended — as in the k2..k5
# comparison below, which passes nstart = 25 — change this to
# nstart = 25; that WOULD alter the fitted clusters and the transcript.
k.means.fit <- kmeans(resultsca, centers = 2, iter.max = 25)
print(k.means.fit)
## K-means clustering with 2 clusters of sizes 658, 342
##
## Cluster means:
## Unit.price Quantity Tax.5. Total cogs Rating
## 1 -0.3907778 -0.4219030 -0.6134235 -0.6134235 -0.6134235 0.02481673
## 2 0.7518474 0.8117316 1.1802124 1.1802124 1.1802124 -0.04774681
## tmed day month week hour daynum
## 1 0.04618716 -0.00565389 0.04841003 0.04970702 -0.003710147 0.005340918
## 2 -0.08886301 0.01087795 -0.09313976 -0.09563514 0.007138237 -0.010275802
## City_Naypyitaw City_Yangon Customer.type_Normal Gender_Male
## 1 -0.02207874 0.02013748 0.02326500 0.03541700
## 2 0.04247898 -0.03874404 -0.04476132 -0.06814148
## Product.line_Fashion_accessories Product.line_Food&Beverages
## 1 0.01142089 0.01806243
## 2 -0.02197352 -0.03475169
## Product.line_Health&Beauty Product.line_Home&Lifestyle
## 1 -0.004298638 -0.005303556
## 2 0.008270478 0.010203918
## Product.line_Sports&Travel Payment_Credit card Payment_Ewallet
## 1 -0.01726056 0.007750807 -0.009618185
## 2 0.03320890 -0.014912372 0.018505164
##
## Clustering vector:
## [1] 2 2 1 1 1 1 2 1 2 2 2 2 2 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1
## [38] 2 1 1 1 2 1 1 2 1 1 2 1 2 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 1 1 2 2 1 2 1 2
## [75] 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 2 1 1 1 1 1 1 1 2 1 2 2 1 1
## [112] 1 2 2 2 1 1 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 1 2 2 1 1 2 1 2 2 1 2 2 2 2 2 1
## [149] 2 1 2 1 1 2 1 1 2 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 1 1 1 2 1 2 1 1 1 2 1
## [186] 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 1 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 1
## [223] 1 2 1 2 1 1 2 1 1 1 1 1 2 2 2 2 1 2 1 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1 1 2 1
## [260] 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1
## [297] 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2 1
## [334] 1 1 1 2 1 1 1 2 1 1 2 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 2 1 1 1
## [371] 2 2 1 1 2 1 1 2 1 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 1 2 1 1 2 2 1 1 2 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 2 1 2 2 1 2 1 2 2
## [445] 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2
## [482] 2 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 2 2 1
## [519] 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 1 1
## [556] 1 2 2 1 2 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1
## [593] 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 1 2 1 1 1 2 2 1 1 1 1 2 1 2 1
## [630] 2 1 2 1 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 2 1 1 2 1 1
## [667] 1 2 1 1 2 1 2 1 2 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
## [704] 1 1 2 1 2 1 1 1 2 2 1 2 1 2 1 1 2 2 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1
## [741] 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 1 1 2 2 2 2 1 1 1 1 1
## [778] 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2 1 2 1 1 2 1 1 1 1 1 2
## [815] 1 1 1 1 1 1 2 1 2 2 2 1 1 2 2 2 1 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1
## [852] 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2 1 2 1 2
## [889] 1 2 1 1 2 1 2 2 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 2 1 1 2 1 2 2 2 1 1 2 1
## [926] 1 1 2 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2
## [963] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1
## [1000] 1
##
## Within cluster sum of squares by cluster:
## [1] 13224.50 6922.61
## (between_SS / total_SS = 12.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Components available on the fitted kmeans object
attributes(k.means.fit)
## $names
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
##
## $class
## [1] "kmeans"
# Size of each cluster
k.means.fit$size
## [1] 658 342
# Plot of the clusters (discriminant projection; plotcluster presumably
# comes from the fpc package — confirm against the library() calls at the
# top of the file)
plotcluster(resultsca, k.means.fit$cluster)

fviz_cluster(k.means.fit, data = resultsca)

# Visual check of how the k-means partition looks for each k in 2..5.
# The four kmeans() calls stay in the same order so the RNG stream (and
# therefore each fit) is identical to the original.
k2 <- kmeans(resultsca, centers = 2, nstart = 25)
k3 <- kmeans(resultsca, centers = 3, nstart = 25)
k4 <- kmeans(resultsca, centers = 4, nstart = 25)
k5 <- kmeans(resultsca, centers = 5, nstart = 25)
# One point-geometry cluster plot per fit, titled with its k.
plot_fit <- function(fit, k) {
  fviz_cluster(fit, geom = "point", data = resultsca) + ggtitle(paste('k =', k))
}
p1 <- plot_fit(k2, 2)
p2 <- plot_fit(k3, 3)
p3 <- plot_fit(k4, 4)
p4 <- plot_fit(k5, 5)
# Arrange the four panels in a 2x2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Reload the raw dataset so the cluster labels can be attached to the
# original (un-scaled) variables.
supe <- "https://github.com/Juanmick/TFM/blob/master/supermarket.rds?raw=true"
supermarket <- readRDS(url(supe))
# Create a column recording which cluster each customer belongs to
supermarket$cluster <- k.means.fit$cluster
# Mean ticket (Total) per cluster; the message below could be silenced
# with `.groups = "drop"` without changing the result
df <- supermarket %>% group_by(cluster) %>%
summarise(mean=mean(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep a dedicated copy with the k-means labels (supermarket$cluster is
# overwritten later by the PAM labels)
dfkm <- supermarket
table(dfkm$cluster)
##
## 1 2
## 658 342
#saveRDS(dfkm, file = "dfkm.rds")
#dfkm <- readRDS("C:/TFM/dfkm.rds")
# Characteristics of cluster 1
dfkm1 <- filter(dfkm, cluster == 1)
describe(dfkm1)
## dfkm1
##
## 19 Variables 658 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 658 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 219 209 230
## Proportion 0.333 0.318 0.350
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 658 0 2
##
## Value Member Normal
## Frequency 322 336
## Proportion 0.489 0.511
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 658 0 2
##
## Value Female Male
## Frequency 318 340
## Proportion 0.483 0.517
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 658 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 111 120 119
## Proportion 0.169 0.182 0.181
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 99 104 105
## Proportion 0.150 0.158 0.160
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 658 0 626 1 45.32 27.77 13.58 16.24
## .25 .50 .75 .90 .95
## 24.75 40.57 62.86 82.98 92.19
##
## lowest : 10.08 10.13 10.16 10.17 10.18, highest: 99.69 99.70 99.79 99.82 99.89
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd .05 .10
## 658 0 10 0.984 4.277 2.949 1 1
## .25 .50 .75 .90 .95
## 2 4 6 8 9
##
## lowest : 1 2 3 4 5, highest: 6 7 8 9 10
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 112 91 90 96 75 57 41 33 32 31
## Proportion 0.170 0.138 0.137 0.146 0.114 0.087 0.062 0.050 0.049 0.047
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 658 0 650 1 8.197 5.41 1.531 2.352
## .25 .50 .75 .90 .95
## 4.154 7.768 11.998 15.139 16.484
##
## lowest : 0.5085 0.6045 0.6270 0.6390 0.6990
## highest: 18.3080 18.6390 19.1300 19.1555 19.4635
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 658 0 650 1 172.1 113.6 32.15 49.39
## .25 .50 .75 .90 .95
## 87.23 163.12 251.96 317.92 346.16
##
## lowest : 10.6785 12.6945 13.1670 13.4190 14.6790
## highest: 384.4680 391.4190 401.7300 402.2655 408.7335
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 658 0 657 1
## Mean Gmd .05 .10
## 2019-02-15 21:23:32 2505485 2019-01-06 13:22:33 2019-01-11 16:07:00
## .25 .50 .75 .90
## 2019-01-25 18:47:15 2019-02-15 14:36:00 2019-03-09 12:11:15 2019-03-22 14:12:36
## .95
## 2019-03-26 14:47:30
##
## lowest : 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 15:51:00 2019-01-01 19:31:00 2019-01-01 20:26:00
## highest: 2019-03-30 12:51:00 2019-03-30 13:22:00 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 658 0 3
##
## Value Cash Credit card Ewallet
## Frequency 227 207 224
## Proportion 0.345 0.315 0.340
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 658 0 650 1 163.9 108.2 30.62 47.04
## .25 .50 .75 .90 .95
## 83.08 155.35 239.96 302.78 329.68
##
## lowest : 10.17 12.09 12.54 12.78 13.98, highest: 366.16 372.78 382.60 383.11 389.27
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 658 0 61 1 7.015 1.957 4.285 4.700
## .25 .50 .75 .90 .95
## 5.700 7.000 8.500 9.400 9.700
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 658 0 88 1 26.82 2.919 22.20 23.27
## .25 .50 .75 .90 .95
## 25.10 27.00 28.50 29.90 30.60
##
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 658 0 7 0.979 4.09 2.281
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 88 83 100 100 87 98 102
## Proportion 0.134 0.126 0.152 0.152 0.132 0.149 0.155
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 658 0 3 0.887 2.033 0.906
##
## Value 1 2 3
## Frequency 217 202 239
## Proportion 0.330 0.307 0.363
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 658 0 13 0.994 7.175 4.153 1 2
## .25 .50 .75 .90 .95
## 4 7 10 12 13
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 36 45 47 57 50 66 47 45 55 59 53
## Proportion 0.055 0.068 0.071 0.087 0.076 0.100 0.071 0.068 0.084 0.090 0.081
##
## Value 12 13
## Frequency 51 47
## Proportion 0.078 0.071
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 658 0 11 0.991 14.9 3.691 10 10
## .25 .50 .75 .90 .95
## 12 15 18 19 20
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 70 57 61 71 44 68 52 49 67 66 53
## Proportion 0.106 0.087 0.093 0.108 0.067 0.103 0.079 0.074 0.102 0.100 0.081
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 658 0 31 0.999 15.3 10 2.00 4.00
## .25 .50 .75 .90 .95
## 7.25 15.00 23.00 27.00 28.15
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 658 0 1 0 1 0
##
## Value 1
## Frequency 658
## Proportion 1
## --------------------------------------------------------------------------------
# Characteristics of cluster 2 (k-means)
dfkm2 <- filter(dfkm, cluster == 2)
describe(dfkm2)
## dfkm2
##
## 19 Variables 342 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 342 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 113 119 110
## Proportion 0.330 0.348 0.322
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 342 0 2
##
## Value Member Normal
## Frequency 179 163
## Proportion 0.523 0.477
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 342 0 2
##
## Value Female Male
## Frequency 183 159
## Proportion 0.535 0.465
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 342 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 59 58 55
## Proportion 0.173 0.170 0.161
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 53 56 61
## Proportion 0.155 0.164 0.178
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 342 0 335 1 75.59 19.66 44.84 51.10
## .25 .50 .75 .90 .95
## 63.45 77.30 90.18 97.26 98.98
##
## lowest : 33.21 34.21 35.54 36.98 37.55, highest: 99.78 99.82 99.83 99.92 99.96
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd
## 342 0 7 0.966 7.883 2.006
##
## lowest : 4 5 6 7 8, highest: 6 7 8 9 10
##
## Value 4 5 6 7 8 9 10
## Frequency 13 27 41 61 52 60 88
## Proportion 0.038 0.079 0.120 0.178 0.152 0.175 0.257
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 342 0 340 1 29.2 9.31 18.86 19.45
## .25 .50 .75 .90 .95
## 22.21 27.52 35.37 40.98 44.60
##
## lowest : 16.605 17.105 17.770 17.828 17.829, highest: 48.690 48.750 49.260 49.490 49.650
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 342 0 340 1 613.2 195.5 396.0 408.4
## .25 .50 .75 .90 .95
## 466.4 577.9 742.7 860.7 936.5
##
## lowest : 348.705 359.205 373.170 374.388 374.409
## highest: 1022.490 1023.750 1034.460 1039.290 1042.650
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 342 0 340 1
## Mean Gmd .05 .10
## 2019-02-12 06:00:35 2495827 2019-01-06 11:27:09 2019-01-10 14:19:18
## .25 .50 .75 .90
## 2019-01-21 22:14:30 2019-02-09 01:10:00 2019-03-05 17:58:45 2019-03-19 18:56:54
## .95
## 2019-03-23 19:06:09
##
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 13:55:00 2019-01-01 14:42:00 2019-01-01 14:47:00
## highest: 2019-03-30 10:18:00 2019-03-30 14:43:00 2019-03-30 14:58:00 2019-03-30 17:58:00 2019-03-30 19:26:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 342 0 3
##
## Value Cash Credit card Ewallet
## Frequency 117 104 121
## Proportion 0.342 0.304 0.354
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 342 0 340 1 584 186.2 377.1 389.0
## .25 .50 .75 .90 .95
## 444.2 550.4 707.3 819.7 891.9
##
## lowest : 332.10 342.10 355.40 356.56 356.58, highest: 973.80 975.00 985.20 989.80 993.00
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 342 0 61 1 6.891 2.037 4.3 4.5
## .25 .50 .75 .90 .95
## 5.3 6.9 8.4 9.3 9.6
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 342 0 76 0.999 26.47 2.885 22.10 23.02
## .25 .50 .75 .90 .95
## 24.90 26.50 28.20 29.79 30.29
##
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.0 31.4 31.7 32.8 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 342 0 7 0.978 4.123 2.316
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 45 42 58 43 51 41 62
## Proportion 0.132 0.123 0.170 0.126 0.149 0.120 0.181
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 342 0 3 0.883 1.915 0.9083
##
## Value 1 2 3
## Frequency 135 101 106
## Proportion 0.395 0.295 0.310
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 342 0 13 0.993 6.652 4.091 1 2
## .25 .50 .75 .90 .95
## 4 6 10 12 12
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 19 28 35 36 33 26 25 15 32 29 25
## Proportion 0.056 0.082 0.102 0.105 0.096 0.076 0.073 0.044 0.094 0.085 0.073
##
## Value 12 13
## Frequency 25 14
## Proportion 0.073 0.041
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 342 0 11 0.99 14.93 3.622 10 11
## .25 .50 .75 .90 .95
## 12 15 18 19 20
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 31 33 28 32 39 34 25 25 26 47 22
## Proportion 0.091 0.096 0.082 0.094 0.114 0.099 0.073 0.073 0.076 0.137 0.064
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 342 0 31 0.999 15.17 10.12 2 3
## .25 .50 .75 .90 .95
## 8 15 23 27 29
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 342 0 1 0 2 0
##
## Value 2
## Frequency 342
## Proportion 1
## --------------------------------------------------------------------------------
7.3 Método con PAM (Partitioning Around Medoids)
# PAM CLUSTERING (Partitioning Around Medoids)
# With k = 2, the k recommended by the NbClust grid above
set.seed(20)
pam.res <- pam(resultsca, 2)
resultadopam <- as.data.frame(pam.res$clustering)
table(resultadopam)
## resultadopam
## 1 2
## 670 330
# Visualizing PAM clusters with fviz_cluster()
fviz_cluster(pam.res)

# Change the color palette and theme
fviz_cluster(pam.res, resultsca,
palette = "Set2", ggtheme = theme_minimal())

# Add the cluster column to the dataset.
# NOTE(review): this overwrites the k-means labels previously stored in
# supermarket$cluster; the k-means copy was kept separately in dfkm.
supermarket$cluster <- pam.res$clustering
df <- supermarket %>% group_by(cluster) %>%
summarise(mean=mean(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
# Dedicated copy with the PAM labels
dfpam <- supermarket
table(dfpam$cluster)
##
## 1 2
## 670 330
# Characteristics of cluster 1
dfpam1 <- filter(dfpam, cluster == 1)
describe(dfpam1)
## dfpam1
##
## 19 Variables 670 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 670 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 261 250 159
## Proportion 0.390 0.373 0.237
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 670 0 2
##
## Value Member Normal
## Frequency 281 389
## Proportion 0.419 0.581
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 670 0 2
##
## Value Female Male
## Frequency 277 393
## Proportion 0.413 0.587
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 670 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 111 127 114
## Proportion 0.166 0.190 0.170
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 110 95 113
## Proportion 0.164 0.142 0.169
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 670 0 644 1 50.81 29.85 14.29 17.48
## .25 .50 .75 .90 .95
## 27.10 48.50 72.55 88.66 95.52
##
## lowest : 10.17 10.18 10.53 10.56 10.59, highest: 99.73 99.78 99.79 99.82 99.96
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd .05 .10
## 670 0 10 0.989 5.106 3.364 1.0 1.0
## .25 .50 .75 .90 .95
## 3.0 5.0 7.0 9.1 10.0
##
## lowest : 1 2 3 4 5, highest: 6 7 8 9 10
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 96 71 66 76 67 63 67 44 53 67
## Proportion 0.143 0.106 0.099 0.113 0.100 0.094 0.100 0.066 0.079 0.100
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 670 0 667 1 12.69 10.99 1.567 2.613
## .25 .50 .75 .90 .95
## 4.684 9.556 17.954 27.446 34.043
##
## lowest : 0.5085 0.6045 0.6270 0.6390 0.6990
## highest: 47.7200 47.7900 48.6850 48.7500 49.4900
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 670 0 667 1 266.6 230.8 32.90 54.86
## .25 .50 .75 .90 .95
## 98.37 200.67 377.03 576.36 714.91
##
## lowest : 10.6785 12.6945 13.1670 13.4190 14.6790
## highest: 1002.1200 1003.5900 1022.3850 1023.7500 1039.2900
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 670 0 668 1
## Mean Gmd .05 .10
## 2019-02-09 07:42:43 2381919 2019-01-05 11:40:33 2019-01-08 14:29:30
## .25 .50 .75 .90
## 2019-01-19 12:46:15 2019-02-07 11:59:30 2019-03-02 19:43:30 2019-03-14 11:08:24
## .95
## 2019-03-19 15:01:24
##
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 13:55:00
## highest: 2019-03-29 14:28:00 2019-03-29 14:44:00 2019-03-30 12:51:00 2019-03-30 14:58:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 670 0 3
##
## Value Cash Credit card Ewallet
## Frequency 263 235 172
## Proportion 0.393 0.351 0.257
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 670 0 667 1 253.9 219.8 31.34 52.25
## .25 .50 .75 .90 .95
## 93.69 191.11 359.07 548.92 680.87
##
## lowest : 10.17 12.09 12.54 12.78 13.98, highest: 954.40 955.80 973.70 975.00 989.80
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 670 0 61 1 7.112 1.957 4.300 4.800
## .25 .50 .75 .90 .95
## 5.725 7.100 8.600 9.500 9.700
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 670 0 81 0.999 26.05 2.851 22.00 22.70
## .25 .50 .75 .90 .95
## 24.50 26.00 27.80 29.12 30.00
##
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 30.9 31.0 31.4 32.8 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 670 0 7 0.977 4.367 2.245
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 67 75 100 96 95 109 128
## Proportion 0.100 0.112 0.149 0.143 0.142 0.163 0.191
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 670 0 3 0.88 1.881 0.8925
##
## Value 1 2 3
## Frequency 272 206 192
## Proportion 0.406 0.307 0.287
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 670 0 13 0.992 6.23 3.919 1 2
## .25 .50 .75 .90 .95
## 3 6 9 11 12
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 51 66 64 60 62 76 48 33 54 64 52
## Proportion 0.076 0.099 0.096 0.090 0.093 0.113 0.072 0.049 0.081 0.096 0.078
##
## Value 12 13
## Frequency 25 15
## Proportion 0.037 0.022
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 670 0 11 0.991 14.98 3.698 10.0 10.9
## .25 .50 .75 .90 .95
## 12.0 15.0 18.0 19.0 20.0
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 67 56 67 65 50 68 50 50 69 71 57
## Proportion 0.100 0.084 0.100 0.097 0.075 0.101 0.075 0.075 0.103 0.106 0.085
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 670 0 31 0.998 13.24 9.624 2 3
## .25 .50 .75 .90 .95
## 6 12 20 26 28
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 670 0 1 0 1 0
##
## Value 1
## Frequency 670
## Proportion 1
## --------------------------------------------------------------------------------
# Characteristics of cluster 2 (PAM)
dfpam2 <- filter(dfpam, cluster == 2)
describe(dfpam2)
## dfpam2
##
## 19 Variables 330 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 330 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 71 78 181
## Proportion 0.215 0.236 0.548
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 330 0 2
##
## Value Member Normal
## Frequency 220 110
## Proportion 0.667 0.333
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 330 0 2
##
## Value Female Male
## Frequency 224 106
## Proportion 0.679 0.321
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 330 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 59 51 60
## Proportion 0.179 0.155 0.182
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 42 65 53
## Proportion 0.127 0.197 0.161
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 330 0 319 1 65.54 28.36 20.52 28.28
## .25 .50 .75 .90 .95
## 47.67 71.54 87.59 96.03 98.70
##
## lowest : 10.08 10.13 10.16 10.69 12.03, highest: 99.71 99.82 99.83 99.89 99.92
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd .05 .10
## 330 0 10 0.987 6.33 3.118 2 2
## .25 .50 .75 .90 .95
## 4 7 9 10 10
##
## lowest : 1 2 3 4 5, highest: 6 7 8 9 10
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 16 20 24 33 35 35 35 41 39 52
## Proportion 0.048 0.061 0.073 0.100 0.106 0.106 0.106 0.124 0.118 0.158
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 330 0 328 1 20.83 14.29 3.668 4.579
## .25 .50 .75 .90 .95
## 9.966 19.519 30.741 38.474 41.847
##
## lowest : 1.2030 1.2645 1.9300 1.9560 2.1480
## highest: 45.3250 48.6050 48.6900 49.2600 49.6500
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 330 0 328 1 437.5 300.1 77.04 96.16
## .25 .50 .75 .90 .95
## 209.28 409.89 645.57 807.95 878.78
##
## lowest : 25.2630 26.5545 40.5300 41.0760 45.1080
## highest: 951.8250 1020.7050 1022.4900 1034.4600 1042.6500
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 330 0 329 1
## Mean Gmd .05 .10
## 2019-02-25 10:58:17 2378492 2019-01-17 02:32:54 2019-01-22 10:56:48
## .25 .50 .75 .90
## 2019-02-03 22:23:15 2019-02-26 17:19:00 2019-03-20 07:19:30 2019-03-26 14:39:24
## .95
## 2019-03-28 15:30:33
##
## lowest : 2019-01-01 14:42:00 2019-01-04 13:34:00 2019-01-05 13:08:00 2019-01-06 13:58:00 2019-01-07 15:01:00
## highest: 2019-03-30 14:43:00 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 17:58:00 2019-03-30 19:26:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 330 0 3
##
## Value Cash Credit card Ewallet
## Frequency 81 76 173
## Proportion 0.245 0.230 0.524
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 330 0 328 1 416.7 285.8 73.37 91.58
## .25 .50 .75 .90 .95
## 199.31 390.38 614.83 769.48 836.94
##
## lowest : 24.06 25.29 38.60 39.12 42.96, highest: 906.50 972.10 973.80 985.20 993.00
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 330 0 60 1 6.689 1.997 4.2 4.4
## .25 .50 .75 .90 .95
## 5.1 6.6 8.0 9.2 9.6
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 330 0 69 0.999 28 2.432 24.23 25.28
## .25 .50 .75 .90 .95
## 26.50 28.00 29.50 30.50 31.00
##
## lowest : 21.6 22.1 22.2 22.7 23.2, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 330 0 7 0.976 3.561 2.238
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 66 50 58 47 43 30 36
## Proportion 0.200 0.152 0.176 0.142 0.130 0.091 0.109
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 330 0 3 0.861 2.221 0.8673
##
## Value 1 2 3
## Frequency 80 97 153
## Proportion 0.242 0.294 0.464
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 330 0 13 0.989 8.552 3.942 3.00 4.00
## .25 .50 .75 .90 .95
## 5.25 9.00 12.00 13.00 13.00
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 4 7 18 33 21 16 24 27 33 24 26
## Proportion 0.012 0.021 0.055 0.100 0.064 0.048 0.073 0.082 0.100 0.073 0.079
##
## Value 12 13
## Frequency 51 46
## Proportion 0.155 0.139
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 330 0 11 0.99 14.77 3.598 10 10
## .25 .50 .75 .90 .95
## 12 15 18 19 20
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 34 34 22 38 33 34 27 24 24 42 18
## Proportion 0.103 0.103 0.067 0.115 0.100 0.103 0.082 0.073 0.073 0.127 0.055
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 330 0 31 0.998 19.35 8.757 4 7
## .25 .50 .75 .90 .95
## 14 21 26 28 29
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 330 0 1 0 2 0
##
## Value 2
## Frequency 330
## Proportion 1
## --------------------------------------------------------------------------------
7.4 Método con Hierarchical Cluster Analysis
set.seed(20)
# HIERARCHICAL CLUSTERING (hclust)
# Dissimilarity matrix: pairwise euclidean distances, required as input by hclust
d <- dist(resultsca, method = "euclidean")
# Hierarchical clustering with Ward's linkage.
# NOTE(review): the original comment said "Complete Linkage", but the call
# uses Ward. Also, "ward.D" expects squared dissimilarities, whereas
# "ward.D2" (the method used in the NbClust grid above) squares the
# euclidean distances internally — confirm which variant was intended.
hc1 <- hclust(d, method = "ward.D" )
# Draw the dendrogram with the 2 clusters highlighted
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 2, border = 2:5)

# Plot
fviz_dend(x = hc1, k = 2, cex = 0.6) +
geom_hline(yintercept = 250, linetype = "dashed") +
labs(title = "Hierarchical clustering",
subtitle = "Method Ward.D, K=2")
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan

# Cut the dendrogram into 2 groups
sub_grp <- cutree(hc1, k = 2)
table(sub_grp)
## sub_grp
## 1 2
## 800 200
# Add the cluster assignments to the data frame (overwriting the PAM
# labels attached to supermarket earlier)
dfh <- supermarket
dfh$cluster <- sub_grp
df <- dfh %>% group_by(cluster) %>%
summarise(mean=mean(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot the clusters
fviz_cluster(list(data = resultsca, cluster = sub_grp))

saveRDS(dfh, file = "dfh.rds")
# Characteristics of cluster 1
dfh1 <- filter(dfh, cluster == 1)
describe(dfh1)
## dfh1
##
## 19 Variables 800 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 800 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 267 250 283
## Proportion 0.334 0.312 0.354
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 800 0 2
##
## Value Member Normal
## Frequency 394 406
## Proportion 0.492 0.507
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 800 0 2
##
## Value Female Male
## Frequency 396 404
## Proportion 0.495 0.505
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 800 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 132 148 151
## Proportion 0.165 0.185 0.189
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 127 130 112
## Proportion 0.159 0.162 0.140
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 800 0 760 1 50.1 29.42 14.35 17.62
## .25 .50 .75 .90 .95
## 27.27 47.33 71.88 88.17 94.67
##
## lowest : 10.08 10.13 10.16 10.17 10.18, highest: 99.79 99.82 99.83 99.89 99.92
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd .05 .10
## 800 0 10 0.988 4.779 3.109 1 1
## .25 .50 .75 .90 .95
## 2 5 7 9 10
##
## lowest : 1 2 3 4 5, highest: 6 7 8 9 10
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 112 91 90 105 99 84 65 53 49 52
## Proportion 0.140 0.114 0.112 0.131 0.124 0.105 0.081 0.066 0.061 0.065
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 800 0 792 1 11.06 8.524 1.647 2.671
## .25 .50 .75 .90 .95
## 4.653 9.290 15.572 22.722 25.577
##
## lowest : 0.5085 0.6045 0.6270 0.6390 0.6990
## highest: 35.6900 40.9750 41.2900 43.7490 43.9350
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 800 0 792 1 232.3 179 34.59 56.10
## .25 .50 .75 .90 .95
## 97.72 195.08 327.01 477.17 537.11
##
## lowest : 10.6785 12.6945 13.1670 13.4190 14.6790
## highest: 749.4900 860.4750 867.0900 918.7290 922.6350
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 800 0 798 1
## Mean Gmd .05 .10
## 2019-02-15 20:32:58 2542407 2019-01-06 12:10:57 2019-01-10 17:15:54
## .25 .50 .75 .90
## 2019-01-25 14:52:15 2019-02-15 17:05:30 2019-03-09 23:26:00 2019-03-22 19:07:24
## .95
## 2019-03-26 19:28:48
##
## lowest : 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 13:55:00 2019-01-01 15:51:00 2019-01-01 19:07:00
## highest: 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 17:58:00 2019-03-30 19:26:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 800 0 3
##
## Value Cash Credit card Ewallet
## Frequency 274 245 281
## Proportion 0.342 0.306 0.351
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 800 0 792 1 221.2 170.5 32.95 53.43
## .25 .50 .75 .90 .95
## 93.06 185.79 311.44 454.45 511.53
##
## lowest : 10.17 12.09 12.54 12.78 13.98, highest: 713.80 819.50 825.80 874.98 878.70
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 800 0 61 1 7.014 1.974 4.3 4.6
## .25 .50 .75 .90 .95
## 5.6 7.0 8.5 9.4 9.7
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 800 0 88 1 26.8 2.948 22.2 23.2
## .25 .50 .75 .90 .95
## 25.1 27.0 28.5 30.0 30.6
##
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 800 0 7 0.979 4.093 2.286
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 105 104 123 118 109 112 129
## Proportion 0.131 0.130 0.154 0.148 0.136 0.140 0.161
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 800 0 3 0.887 2.03 0.9117
##
## Value 1 2 3
## Frequency 269 238 293
## Proportion 0.336 0.298 0.366
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 800 0 13 0.994 7.171 4.199 1 2
## .25 .50 .75 .90 .95
## 4 7 10 12 13
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 44 58 58 74 58 75 54 50 68 73 67
## Proportion 0.055 0.072 0.072 0.092 0.072 0.094 0.068 0.062 0.085 0.091 0.084
##
## Value 12 13
## Frequency 63 58
## Proportion 0.079 0.072
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 800 0 11 0.991 15.01 3.646 10 11
## .25 .50 .75 .90 .95
## 12 15 18 19 20
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 76 66 73 83 64 80 64 62 80 90 62
## Proportion 0.095 0.082 0.091 0.104 0.080 0.100 0.080 0.078 0.100 0.112 0.078
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 800 0 31 0.999 15.38 10.06 2 4
## .25 .50 .75 .90 .95
## 8 15 23 27 29
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 800 0 1 0 1 0
##
## Value 1
## Frequency 800
## Proportion 1
## --------------------------------------------------------------------------------
# Summary statistics for cluster 2 (Hmisc::describe output follows)
dfh2 <- filter(dfh, cluster == 2)
describe(dfh2)
## dfh2
##
## 19 Variables 200 Observations
## --------------------------------------------------------------------------------
## City
## n missing distinct
## 200 0 3
##
## Value Mandalay Naypyitaw Yangon
## Frequency 65 78 57
## Proportion 0.325 0.390 0.285
## --------------------------------------------------------------------------------
## Customer.type
## n missing distinct
## 200 0 2
##
## Value Member Normal
## Frequency 107 93
## Proportion 0.535 0.465
## --------------------------------------------------------------------------------
## Gender
## n missing distinct
## 200 0 2
##
## Value Female Male
## Frequency 105 95
## Proportion 0.525 0.475
## --------------------------------------------------------------------------------
## Product.line
## n missing distinct
## 200 0 6
##
## lowest : Electronic Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle
## highest: Fashion_accessories Food&Beverages Health&Beauty Home&Lifestyle Sports&Travel
##
## Value Electronic Fashion_accessories Food&Beverages
## Frequency 38 30 23
## Proportion 0.190 0.150 0.115
##
## Value Health&Beauty Home&Lifestyle Sports&Travel
## Frequency 25 30 54
## Proportion 0.125 0.150 0.270
## --------------------------------------------------------------------------------
## Unit.price
## n missing distinct Info Mean Gmd .05 .10
## 200 0 197 1 77.96 18.42 47.63 55.66
## .25 .50 .75 .90 .95
## 66.61 79.90 90.71 97.53 99.10
##
## lowest : 21.43 31.99 37.32 43.13 44.02, highest: 99.55 99.56 99.73 99.82 99.96
## --------------------------------------------------------------------------------
## Quantity
## n missing distinct Info Mean Gmd
## 200 0 7 0.942 8.435 1.655
##
## lowest : 4 5 6 7 8, highest: 6 7 8 9 10
##
## Value 4 5 6 7 8 9 10
## Frequency 4 3 14 37 32 43 67
## Proportion 0.020 0.015 0.070 0.185 0.160 0.215 0.335
## --------------------------------------------------------------------------------
## Tax.5.
## n missing distinct Info Mean Gmd .05 .10
## 200 0 198 1 32.65 9.665 19.02 21.24
## .25 .50 .75 .90 .95
## 26.68 33.66 38.65 44.34 45.25
##
## lowest : 10.7150 13.4520 15.9530 15.9950 16.2425
## highest: 48.6900 48.7500 49.2600 49.4900 49.6500
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 200 0 198 1 685.6 203 399.3 446.0
## .25 .50 .75 .90 .95
## 560.2 706.9 811.5 931.2 950.3
##
## lowest : 225.0150 282.4920 335.0130 335.8950 341.0925
## highest: 1022.4900 1023.7500 1034.4600 1039.2900 1042.6500
## --------------------------------------------------------------------------------
## datetime
## n missing distinct Info
## 200 0 198 1
## Mean Gmd .05 .10
## 2019-02-09 19:20:24 2300585 2019-01-06 12:41:51 2019-01-12 09:03:42
## .25 .50 .75 .90
## 2019-01-21 09:00:15 2019-02-07 12:57:30 2019-03-01 23:42:15 2019-03-14 22:24:18
## .95
## 2019-03-20 11:46:48
##
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 14:42:00 2019-01-01 14:47:00 2019-01-03 19:08:00
## highest: 2019-03-23 13:23:00 2019-03-24 18:27:00 2019-03-25 18:30:00 2019-03-27 10:43:00 2019-03-29 19:12:00
## --------------------------------------------------------------------------------
## Payment
## n missing distinct
## 200 0 3
##
## Value Cash Credit card Ewallet
## Frequency 70 66 64
## Proportion 0.35 0.33 0.32
## --------------------------------------------------------------------------------
## cogs
## n missing distinct Info Mean Gmd .05 .10
## 200 0 198 1 653 193.3 380.3 424.7
## .25 .50 .75 .90 .95
## 533.6 673.3 772.9 886.8 905.1
##
## lowest : 214.30 269.04 319.06 319.90 324.85, highest: 973.80 975.00 985.20 989.80 993.00
## --------------------------------------------------------------------------------
## Rating
## n missing distinct Info Mean Gmd .05 .10
## 200 0 59 1 6.809 2.023 4.200 4.490
## .25 .50 .75 .90 .95
## 5.300 6.650 8.300 9.310 9.605
##
## lowest : 4.0 4.1 4.2 4.3 4.4, highest: 9.6 9.7 9.8 9.9 10.0
## --------------------------------------------------------------------------------
## tmed
## n missing distinct Info Mean Gmd .05 .10
## 200 0 60 0.999 26.28 2.712 22.1 22.9
## .25 .50 .75 .90 .95
## 24.9 26.4 27.7 29.5 30.0
##
## lowest : 20.1 20.5 21.2 21.3 21.6, highest: 30.0 30.1 30.5 31.4 34.0
## --------------------------------------------------------------------------------
## day
## n missing distinct Info Mean Gmd
## 200 0 7 0.978 4.135 2.322
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 28 21 35 25 29 27 35
## Proportion 0.140 0.105 0.175 0.125 0.145 0.135 0.175
## --------------------------------------------------------------------------------
## month
## n missing distinct Info Mean Gmd
## 200 0 3 0.877 1.845 0.8747
##
## Value 1 2 3
## Frequency 83 65 52
## Proportion 0.415 0.325 0.260
## --------------------------------------------------------------------------------
## week
## n missing distinct Info Mean Gmd .05 .10
## 200 0 13 0.992 6.295 3.792 1.00 2.00
## .25 .50 .75 .90 .95
## 3.75 6.00 9.00 11.00 12.00
##
## lowest : 1 2 3 4 5, highest: 9 10 11 12 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 11 15 24 19 25 17 18 10 19 15 11
## Proportion 0.055 0.075 0.120 0.095 0.125 0.085 0.090 0.050 0.095 0.075 0.055
##
## Value 12 13
## Frequency 13 3
## Proportion 0.065 0.015
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 200 0 11 0.99 14.53 3.711 10 10
## .25 .50 .75 .90 .95
## 12 14 17 19 20
##
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##
## Value 10 11 12 13 14 15 16 17 18 19 20
## Frequency 25 24 16 20 19 22 13 12 13 23 13
## Proportion 0.125 0.120 0.080 0.100 0.095 0.110 0.065 0.060 0.065 0.115 0.065
## --------------------------------------------------------------------------------
## daynum
## n missing distinct Info Mean Gmd .05 .10
## 200 0 31 0.998 14.77 9.937 2.00 3.00
## .25 .50 .75 .90 .95
## 8.00 15.00 22.25 27.00 28.00
##
## lowest : 1 2 3 4 5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster
## n missing distinct Info Mean Gmd
## 200 0 1 0 2 0
##
## Value 2
## Frequency 200
## Proportion 1
## --------------------------------------------------------------------------------
8. Predicción de los grupos del cluster
8.1 Feature Engineering
# Convert to data.table so := can add frequency-encoding columns by reference
supermarket <- as.data.table(supermarket)
# Frequency encoding: for each categorical level, the count of rows with that level
supermarket[ , fe_city := .N , by = .(City)]
supermarket[ , fe_gender := .N , by = .(Gender)]
supermarket[ , fe_customer := .N , by = .(Customer.type)]
supermarket[ , fe_product := .N , by = .(Product.line)]
supermarket[ , fe_payment := .N , by = .(Payment)]
# City coordinates used below:
# Mandalay 21.959433, 96.101045
# Naypyitaw 19.740465, 96.090555
# Yangon 16.877067, 96.177399
# NOTE(review): for these Myanmar cities the first number of each pair is the
# LATITUDE (N) and the second the LONGITUDE (E), so the `longitude` and
# `latitude` columns below appear to be swapped. fe_lonlat is unaffected
# (it is symmetric in both), but the individual columns are mislabeled --
# confirm before reusing them on their own.
supermarket$longitude = ifelse(supermarket$City == "Mandalay", 21.959433, ifelse(supermarket$City == "Naypyitaw", 19.740465, 16.877067))
supermarket$latitude = ifelse(supermarket$City == "Mandalay", 96.101045, ifelse(supermarket$City == "Naypyitaw", 96.090555, 96.177399))
# Combined coordinate feature: Euclidean norm of the two coordinates
supermarket$fe_lonlat <- sqrt(supermarket$longitude^2 + supermarket$latitude^2)
# One-hot encode the categoricals, dropping the first level of each (avoids collinearity)
supermarket1 <- fastDummies::dummy_cols(supermarket,remove_first_dummy = TRUE)
supermarket1[,1:4] <- NULL
supermarket1$Payment <- NULL
supermarket1$datetime <- NULL
# Rename columns BY POSITION before scaling
# NOTE(review): positional renaming is fragile -- it silently mislabels columns
# if dummy_cols ever changes the column order
names (supermarket1)[1] = "UnitPrice"
names (supermarket1)[3] = "Tax5"
names (supermarket1)[21] = "CityNaypyitaw"
names (supermarket1)[22] = "CityYangon"
names (supermarket1)[23] = "CustomerTypeNormal"
names (supermarket1)[24] = "GenderMale"
names (supermarket1)[25] = "PLFashionAccessories"
names (supermarket1)[26] = "PLFoodBeverages"
names (supermarket1)[27] = "PLHealthBeauty"
names (supermarket1)[28] = "PLHomeLifestyle"
names (supermarket1)[29] = "PLSportsTravel"
names (supermarket1)[30] = "PaymentCreditCard"
names (supermarket1)[31] = "PaymentEwallet"
# Standardize every column (z-scores), then back to a data.frame
supersca <- scale(supermarket1)
supersca <- as.data.frame(supersca)
saveRDS(supersca, file = "supersca.rds")
# Target variable: cluster labels from the earlier PAM clustering (dfpam)
supersca$cluster <- dfpam$cluster
# Fixed seed so the train/test split is reproducible
set.seed(1234)
# 70/30 split stratified on the cluster label (caret::createDataPartition)
validationIndex <- createDataPartition(supersca$cluster, p=0.70, list=FALSE)
# Hold-out set for validation
my_test1 <- supersca[-validationIndex,]
saveRDS(my_test1, file = "my_test1.rds")
# Training set
my_train1 <- supersca[validationIndex,]
saveRDS(my_train1, file = "my_train1.rds")
prop.table(table(supersca$cluster))
##
## 1 2
## 0.67 0.33
prop.table(table(my_test1$cluster))
##
## 1 2
## 0.7233333 0.2766667
prop.table(table(my_train1$cluster))
##
## 1 2
## 0.6471429 0.3528571
8.2 K-Nearest-Neighbors
# KNN (K-Nearest-Neighbors) model, class::knn with k = 2
# Reload the frozen train/test splits so results match the published RDS files
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
set.seed(1234)
# Column 32 holds the cluster label; every other column is a scaled predictor
knnpred <- knn(train = my_train1[,-32],
cl = my_train1[, 32],
test = my_test1[,-32],
k = 2)
# Accuracy: fraction of test rows whose predicted cluster matches the true one
PredKNN = mean(knnpred == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",PredKNN ))
## [1] "Porcentaje de acierto/Accuracy: 0.77"
# Confusion matrix in long form; with table(Pred, Obj) the rows are:
# row1=(P1,O1) TP for group 1, row2=(P2,O1) FN1, row3=(P1,O2) FP1, row4=(P2,O2)
cfm <- as.data.frame(table(Pred = knnpred, Obj = my_test1$cluster))
# Misclassification count = sum of the two off-diagonal cells
errclasKNN <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasKNN ))
## [1] "Errores de clasificación: 69"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# ROCR objects built from HARD class labels (class::knn gives no probabilities
# here, so the ROC curve has a single operating point)
pred1 <- prediction(as.numeric(knnpred), as.numeric(my_test1$cluster))
# ROC curve: 'false positive rate' vs 'true positive rate' -- closer to the top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -- the closer to 1 the better, at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.729415357281661"
# Precision: of the rows classified as group X, how many really are? high = few false positives
# Precision group 1 = TP1 / (TP1 + FP1)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.855769230769231"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.576086956521739"
# Recall: of the rows that truly are group X, how many get classified as such? high = few false negatives
# Recall group 1 = TP1 / (TP1 + FN1)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.820276497695853"
# Recall group 2
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.63855421686747"
# F1 score group 1 (harmonic mean of precision and recall)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.83764705882353"
# F1 score group 2 (original comment said "F2 SCORE"; it is the F1 score)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.605714285714286"
# Metrics table; each later model appends a row via rbind(RESULTADOS, ...)
RESULTADOS <-data.frame(modelo = ('knn'),
accuracy =c(PredKNN),
AUC =c(auc),
precision1 =c(prec1),
precision2 =c(prec2),
recall1 =c(rec1),
recall2 =c(rec2),
f1grupo1 =c(f1gr1),
f1grupo2 =c(f1gr2),
errorClas =c(errclasKNN)
)
8.3 Support Vector Machines
# Reload the frozen train/test splits
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# caret::train treats the task as classification when the outcome is a factor
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
set.seed(1234)
# RBF-kernel SVM, hyperparameters tuned by caret's default grid search
modeloSVM = train(form = cluster ~ ., data = my_train1, method = 'svmRadial')
modeloSVM$bestTune
## sigma C
## 3 0.01849074 1
pred_valid_SVM = predict(modeloSVM, newdata = my_test1[,-32])
# Confusion matrix.
# NOTE(review): this table is (Obj, Pred) -- the TRANSPOSE of the KNN section's
# (Pred, Obj). With this layout row2=(Obj=2,Pred=1) is a false positive for
# group 1 and row3=(Obj=1,Pred=2) a false negative, so the `prec*` formulas
# below actually compute RECALL and the `rec*` formulas compute PRECISION
# (the printed labels are swapped).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_SVM))
cfm
## Obj Pred Freq
## 1 1 1 208
## 2 2 1 8
## 3 1 2 9
## 4 2 2 75
# Misclassification count = sum of the two off-diagonal cells
errclasSVM <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasSVM ))
## [1] "Errores de clasificación: 17"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predSVM = mean(pred_valid_SVM == my_test1$cluster)
predSVM
## [1] 0.9433333
print(paste0("Porcentaje de acierto/Accuracy: ",predSVM ))
## [1] "Porcentaje de acierto/Accuracy: 0.943333333333333"
# ROCR objects from hard class labels (single operating point on the ROC curve)
pred1 <- prediction(as.numeric(pred_valid_SVM), as.numeric(my_test1$cluster))
# ROC curve: 'false positive rate' vs 'true positive rate' -- closer to the top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -- the closer to 1 the better, at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.931069901726723"
# Intended as precision (see NOTE above: with this cfm layout it is recall)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.95852534562212"
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.903614457831325"
# Intended as recall (see NOTE above: with this cfm layout it is precision)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.962962962962963"
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.892857142857143"
# F1 score group 1 (unaffected by the precision/recall label swap -- F1 is symmetric)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.960739030023095"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.898203592814371"
# Append this model's metrics to the comparison table
z <-data.frame(modelo = ('svm'),
accuracy =c(predSVM),
AUC =c(auc),
precision1 =c(prec1),
precision2 =c(prec2),
recall1 =c(rec1),
recall2 =c(rec2),
f1grupo1 =c(f1gr1),
f1grupo2 =c(f1gr2),
errorClas =c(errclasSVM)
)
RESULTADOS <-rbind(RESULTADOS,z)
8.4 Generalized Linear Model
*Regresión logística (GLM con familia binomial)
# Reload the frozen train/test splits
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
library(stats)
# Logistic regression (binomial GLM). Columns 4:5 are dropped as correlated
# predictors (presumably Total and cogs, per the column order in the LDA
# coefficients later -- confirm).
# NOTE(review): the summary output below shows near-zero residual deviance and
# enormous standard errors -- classic complete separation; coefficient
# estimates and p-values from this fit are not interpretable.
modeloGLM <- glm(as.factor(cluster) ~ ., family = binomial, data = my_train1[,-4:-5])
summary(modeloGLM)
##
## Call:
## glm(formula = as.factor(cluster) ~ ., family = binomial, data = my_train1[,
## -4:-5])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.842e-04 -2.100e-08 -2.100e-08 2.100e-08 2.849e-04
##
## Coefficients: (8 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -183.55 12864.02 -0.014 0.989
## UnitPrice 82.84 11570.41 0.007 0.994
## Quantity 30.90 14973.49 0.002 0.998
## Tax5 96.05 14920.70 0.006 0.995
## Rating -55.64 17530.29 -0.003 0.997
## tmed 93.73 9478.79 0.010 0.992
## day -110.15 12054.81 -0.009 0.993
## month -11.30 111681.39 0.000 1.000
## week 74.52 108682.13 0.001 0.999
## hour -19.80 17469.90 -0.001 0.999
## daynum 151.38 40764.04 0.004 0.997
## fe_city 86.49 9078.32 0.010 0.992
## fe_gender 145.95 10975.73 0.013 0.989
## fe_customer 141.36 9671.00 0.015 0.988
## fe_product -51.63 48966.42 -0.001 0.999
## fe_payment 4935.14 327392.85 0.015 0.988
## longitude -83.47 10544.28 -0.008 0.994
## latitude NA NA NA NA
## fe_lonlat NA NA NA NA
## CityNaypyitaw NA NA NA NA
## CityYangon NA NA NA NA
## CustomerTypeNormal NA NA NA NA
## GenderMale NA NA NA NA
## PLFashionAccessories 22.33 29788.73 0.001 0.999
## PLFoodBeverages 12.84 15948.70 0.001 0.999
## PLHealthBeauty -41.16 34227.62 -0.001 0.999
## PLHomeLifestyle -24.59 28149.10 -0.001 0.999
## PLSportsTravel NA NA NA NA
## PaymentCreditCard 4865.16 321386.27 0.015 0.988
## PaymentEwallet NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9.0888e+02 on 699 degrees of freedom
## Residual deviance: 6.0020e-07 on 678 degrees of freedom
## AIC: 44
##
## Number of Fisher Scoring iterations: 25
# The last column of the summary shows each predictor's p-value, i.e. its
# statistical relevance as a predictive term (but see the separation caveat).
pred_valid_GLM <- predict(modeloGLM, type = 'response', newdata = my_test1[,-4:-5])
# Threshold the predicted probabilities at 0.5 ...
pred_valid_GLM <- ifelse(pred_valid_GLM > 0.5, 1, 0)
# ... then relabel: prob > 0.5 -> class "2", otherwise class "1"
pred_valid_GLM <- factor(pred_valid_GLM, levels = c("0", "1"), labels = c("1", "2"))
# Confusion matrix in (Obj, Pred) order.
# NOTE(review): as in the SVM section, with this layout the `prec*` formulas
# below compute RECALL and the `rec*` formulas compute PRECISION (labels swapped).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_GLM))
cfm
## Obj Pred Freq
## 1 1 1 212
## 2 2 1 1
## 3 1 2 5
## 4 2 2 82
# Misclassification count = sum of the two off-diagonal cells
errclasGLM <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasGLM ))
## [1] "Errores de clasificación: 6"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predGLM = mean(pred_valid_GLM == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predGLM ))
## [1] "Porcentaje de acierto/Accuracy: 0.98"
# ROCR objects from hard class labels (single operating point)
pred1 <- prediction(as.numeric(pred_valid_GLM), as.numeric(my_test1$cluster))
# ROC curve: 'false positive rate' vs 'true positive rate' -- closer to the top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -- the closer to 1 the better, at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.982455166287269"
# Intended as precision (with this cfm layout it is actually recall)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.976958525345622"
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.987951807228916"
# Intended as recall (with this cfm layout it is actually precision)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.995305164319249"
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.942528735632184"
# F1 score group 1 (F1 is symmetric in precision/recall, so unaffected by the swap)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.986046511627907"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.964705882352941"
# Append this model's metrics to the comparison table
z1 <-data.frame(modelo = ('glm'),
accuracy =c(predGLM),
AUC =c(auc),
precision1 =c(prec1),
precision2 =c(prec2),
recall1 =c(rec1),
recall2 =c(rec2),
f1grupo1 =c(f1gr1),
f1grupo2 =c(f1gr2),
errorClas =c(errclasGLM)
)
RESULTADOS <-rbind(RESULTADOS,z1)
8.5 Naive Bayes
# Reload the frozen train/test splits
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# naiveBayes needs a factor outcome for classification
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
set.seed(1234)
modeloBayes <- naiveBayes(cluster ~ ., data = my_train1)
pred_valid_BAYES <- predict(modeloBayes, newdata = my_test1)
# Confusion matrix in (Obj, Pred) order.
# NOTE(review): as in the SVM/GLM sections, with this layout the `prec*`
# formulas below compute RECALL and the `rec*` formulas PRECISION (labels swapped).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_BAYES))
cfm
## Obj Pred Freq
## 1 1 1 190
## 2 2 1 18
## 3 1 2 27
## 4 2 2 65
# Misclassification count = sum of the two off-diagonal cells
errclasBAYES <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasBAYES ))
## [1] "Errores de clasificación: 45"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predBAYES = mean(pred_valid_BAYES == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predBAYES ))
## [1] "Porcentaje de acierto/Accuracy: 0.85"
# ROCR objects from hard class labels (single operating point)
pred1 <- prediction(as.numeric(pred_valid_BAYES), as.numeric(my_test1$cluster))
# ROC curve: 'false positive rate' vs 'true positive rate' -- closer to the top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -- the closer to 1 the better, at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.829354283493421"
# Intended as precision (with this cfm layout it is actually recall)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.875576036866359"
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.783132530120482"
# Intended as recall (with this cfm layout it is actually precision)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.913461538461538"
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.706521739130435"
# F1 score group 1 (symmetric in precision/recall, so unaffected by the swap)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.894117647058824"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.742857142857143"
# Append this model's metrics to the comparison table
z2 <-data.frame(modelo = ('NaiveBayes'),
accuracy =c(predBAYES),
AUC =c(auc),
precision1 =c(prec1),
precision2 =c(prec2),
recall1 =c(rec1),
recall2 =c(rec2),
f1grupo1 =c(f1gr1),
f1grupo2 =c(f1gr2),
errorClas =c(errclasBAYES)
)
RESULTADOS <-rbind(RESULTADOS,z2)
8.6 Árbol de decisión (RPART)
# Reload the frozen train/test splits
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# rpart needs a factor outcome for classification
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
# Tune minsplit by 10-fold cross-validation (e1071::tune.rpart); best value is 5
obj3 <- tune.rpart(cluster~., data = my_train1, minsplit = c(5,10,15))
summary(obj3)
##
## Parameter tuning of 'rpart.wrapper':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## minsplit
## 5
##
## - best performance: 0.22
##
## - Detailed performance results:
## minsplit error dispersion
## 1 5 0.2200000 0.06324555
## 2 10 0.2200000 0.06324555
## 3 15 0.2228571 0.06179477
plot(obj3)

set.seed(1234)
modeloDT <- rpart(cluster ~ ., data = my_train1, minsplit = 5)
pred_valid_DT <- predict(modeloDT, newdata = my_test1, type = 'class')
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_DT))
cfm
## Obj Pred Freq
## 1 1 1 184
## 2 2 1 28
## 3 1 2 33
## 4 2 2 55
# Errores de clasificacion
errclasDT <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasDT ))#54 errores
## [1] "Errores de clasificación: 61"
# Matriz de confusion
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

predRPART = mean(pred_valid_DT == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRPART ))
## [1] "Porcentaje de acierto/Accuracy: 0.796666666666667"
pred1 <- prediction(as.numeric(pred_valid_DT), as.numeric(my_test1$cluster))
# CURVA ROC 'False positive rate' vs. 'True positive rate' -> CUANTO MAS ARRIBA A LA IZQUIERDA MEJOR
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# AREA BAJO LA CURVA ROC -> MEJOR CUANTO MAS CERCANO A 1 Y AL MENOS SUPERIOR A 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.755288434845372"
# NOTE(review): the original precision/recall formulas were swapped — the
# "precision" denominators summed over the OBSERVED class (cfm rows sharing Obj)
# instead of the PREDICTED class (cfm rows sharing Pred). Fixed below; the
# printed values swap accordingly. F1 is the harmonic mean of the two and is
# therefore unchanged. cfm rows: 1=(Obj=1,Pred=1), 2=(Obj=2,Pred=1),
# 3=(Obj=1,Pred=2), 4=(Obj=2,Pred=2).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP) = correct / everything predicted as group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.867924528301887"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.625"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVES
# Recall group 1 = TP / (TP + FN) = correct / everything actually in group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.847926267281106"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.662650602409639"
# F1 score group 1 (harmonic mean of precision and recall; invariant to the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.857808857808858"
# F1 score group 2 (original comment said "F2 SCORE" — typo)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.64327485380117"
# Collect the rpart metrics into a one-row summary and append it to the
# running results table.
z3 <- data.frame(
  modelo = "rpart",
  accuracy = predRPART,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasDT
)
RESULTADOS <- rbind(RESULTADOS, z3)
8.7 Linear Discriminant Analysis
# Reload the train/test partitions so this section is self-contained.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# MASS::lda fits a *linear discriminant analysis* classifier (not Latent
# Dirichlet Allocation). NOTE(review): the result shadows the lda() function
# name; subsequent calls still resolve because R looks up functions separately.
lda <- lda(cluster ~ ., data = my_train1)
coef(lda)
## LD1
## UnitPrice 0.17194038
## Quantity 0.01959883
## Tax5 0.17275307
## Total 0.17275307
## cogs 0.17275307
## Rating -0.21915703
## tmed 0.31981099
## day -0.34177656
## month 0.34371771
## week -0.14997366
## hour -0.01282634
## daynum 0.70094224
## fe_city 0.11130508
## fe_gender 0.27364556
## fe_customer 0.24515802
## fe_product 0.02305005
## fe_payment 0.00737049
## longitude -0.09420155
## latitude 0.11462921
## fe_lonlat -0.08722340
## CityNaypyitaw -0.06804700
## CityYangon 0.11389982
## CustomerTypeNormal -0.24515802
## GenderMale -0.27364556
## PLFashionAccessories -0.09121014
## PLFoodBeverages 0.03124522
## PLHealthBeauty -0.06811002
## PLHomeLifestyle 0.01224416
## PLSportsTravel -0.05500083
## PaymentCreditCard 0.01005304
## PaymentEwallet 0.58871422
# Predict on the test set; pred$class is a factor, coerced to numeric codes 1/2
pred <- predict(lda, my_test1)
pred_valid_LDA <- as.numeric(pred$class)
# Long-format confusion table: rows (Obj=1,Pred=1), (2,1), (1,2), (2,2)
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_LDA))
cfm
## Obj Pred Freq
## 1 1 1 210
## 2 2 1 4
## 3 1 2 7
## 4 2 2 79
# Misclassification count = off-diagonal cells (rows 2 and 3)
errclasLDA <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasLDA ))
## [1] "Errores de clasificación: 11"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predLDA = mean(pred_valid_LDA == my_test1$cluster)
predLDA
## [1] 0.9633333
print(paste0("Porcentaje de acierto/Accuracy: ",predLDA ))
## [1] "Porcentaje de acierto/Accuracy: 0.963333333333333"
# ROCR prediction object from the numeric-coded labels
pred1 <- prediction(as.numeric(pred_valid_LDA), as.numeric(my_test1$cluster))
# ROC curve: 'False positive rate' vs. 'True positive rate' -> closer to top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> better the closer to 1, and should be at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.959774582199767"
# NOTE(review): precision/recall formulas were swapped in the original (precision
# summed over observed class Obj instead of predicted class Pred). Fixed; the
# printed values swap accordingly and F1 is unchanged (it is symmetric).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.981308411214953"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.918604651162791"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVES
# Recall group 1 = TP / (TP + FN)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.967741935483871"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.951807228915663"
# F1 score group 1 (unchanged by the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.974477958236659"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.93491124260355"
# Collect the LDA metrics into a one-row summary and append it to the
# running results table.
z4 <- data.frame(
  modelo = "lda",
  accuracy = predLDA,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasLDA
)
RESULTADOS <- rbind(RESULTADOS, z4)
8.8 Adabag Boosting
# Reload the partitions; adabag::boosting requires a factor response.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
library(adabag)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
# AdaBoost.M1 with 50 trees; boos=TRUE draws a bootstrap sample using the
# current observation weights at each iteration.
model = boosting(cluster~., data=my_train1, boos=TRUE, mfinal=50)
print(names(model))
## [1] "formula" "trees" "weights" "votes" "prob"
## [6] "class" "importance" "terms" "call"
# Relative importance (in %) of each predictor across the boosted trees
model$importance
## CityNaypyitaw CityYangon cogs
## 0.00000000 0.00000000 0.00000000
## CustomerTypeNormal day daynum
## 0.74919941 5.57559368 13.03371994
## fe_city fe_customer fe_gender
## 5.65445964 5.99890496 5.76982886
## fe_lonlat fe_payment fe_product
## 0.00000000 8.48458151 1.83541540
## GenderMale hour latitude
## 1.00698081 3.49894062 0.00000000
## longitude month PaymentCreditCard
## 1.35292374 0.06094276 0.06981940
## PaymentEwallet PLFashionAccessories PLFoodBeverages
## 0.00000000 0.00000000 0.86212968
## PLHealthBeauty PLHomeLifestyle PLSportsTravel
## 0.00000000 0.45846986 0.32577381
## Quantity Rating Tax5
## 2.85153048 7.97665258 12.63500103
## tmed Total UnitPrice
## 10.51587883 0.00000000 8.02500179
## week
## 3.25825118
# Predict on the test set; adabag returns votes, per-class probabilities,
# predicted class, a confusion matrix and the error rate.
pred = predict(model, my_test1)
print(names(pred))
## [1] "formula" "votes" "prob" "class" "confusion" "error"
print(pred$confusion)
## Observed Class
## Predicted Class 1 2
## 1 202 14
## 2 15 69
print(pred$error)
## [1] 0.09666667
# pred$class is a character vector ("1"/"2")
pred_valid_ADABAG <- pred$class
# Long-format confusion table: rows (Obj=1,Pred=1), (2,1), (1,2), (2,2)
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_ADABAG))
cfm
## Obj Pred Freq
## 1 1 1 202
## 2 2 1 14
## 3 1 2 15
## 4 2 2 69
# Misclassification count = off-diagonal cells (rows 2 and 3)
errclasADABAG<- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasADABAG ))
## [1] "Errores de clasificación: 29"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predADABAG = mean(pred_valid_ADABAG == my_test1$cluster)
predADABAG
## [1] 0.9033333
print(paste0("Porcentaje de acierto/Accuracy: ",predADABAG ))
## [1] "Porcentaje de acierto/Accuracy: 0.903333333333333"
# ROCR prediction object from the numeric-coded labels
pred1 <- prediction(as.numeric(pred_valid_ADABAG), as.numeric(my_test1$cluster))
# ROC curve: 'False positive rate' vs. 'True positive rate' -> closer to top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> better the closer to 1, and should be at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.881100438620843"
# NOTE(review): precision/recall formulas were swapped in the original (precision
# summed over observed class Obj instead of predicted class Pred). Fixed; the
# printed values swap accordingly and F1 is unchanged (it is symmetric).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.935185185185185"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.821428571428571"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVES
# Recall group 1 = TP / (TP + FN)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.930875576036866"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.831325301204819"
# F1 score group 1 (unchanged by the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.933025404157044"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.826347305389222"
# Collect the adabag metrics into a one-row summary and append it to the
# running results table.
z5 <- data.frame(
  modelo = "adabag",
  accuracy = predADABAG,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasADABAG
)
RESULTADOS <- rbind(RESULTADOS, z5)
# Per-row class probabilities next to the observed and predicted cluster
result = data.frame(my_test1$cluster, pred$prob, pred$class)
print(result)
## my_test1.cluster X1 X2 pred.class
## 1 1 0.98044479 0.01955521 1
## 2 1 0.65719870 0.34280130 1
## 3 1 0.84465085 0.15534915 1
## 4 1 0.88509134 0.11490866 1
## 5 1 0.82103833 0.17896167 1
## 6 1 0.67525775 0.32474225 1
## 7 1 0.87795282 0.12204718 1
## 8 1 0.69595980 0.30404020 1
## 9 1 0.87067310 0.12932690 1
## 10 1 0.37756652 0.62243348 2
## 11 1 0.72783974 0.27216026 1
## 12 1 0.85719189 0.14280811 1
## 13 1 0.84544412 0.15455588 1
## 14 1 0.74537205 0.25462795 1
## 15 1 0.79382014 0.20617986 1
## 16 2 0.63579342 0.36420658 1
## 17 1 0.73538188 0.26461812 1
## 18 1 0.85851076 0.14148924 1
## 19 1 0.83021034 0.16978966 1
## 20 1 0.94456789 0.05543211 1
## 21 1 0.66026878 0.33973122 1
## 22 2 0.67174560 0.32825440 1
## 23 1 1.00000000 0.00000000 1
## 24 1 0.74533376 0.25466624 1
## 25 1 0.76903742 0.23096258 1
## 26 1 0.78237583 0.21762417 1
## 27 1 0.77195257 0.22804743 1
## 28 1 0.53978897 0.46021103 1
## 29 1 0.89159380 0.10840620 1
## 30 1 0.86567834 0.13432166 1
## 31 1 0.82477625 0.17522375 1
## 32 1 0.60470923 0.39529077 1
## 33 1 0.82043569 0.17956431 1
## 34 1 0.87925919 0.12074081 1
## 35 1 0.57908722 0.42091278 1
## 36 1 0.84304697 0.15695303 1
## 37 1 0.90036947 0.09963053 1
## 38 1 0.68300615 0.31699385 1
## 39 2 0.45580125 0.54419875 2
## 40 1 0.90522610 0.09477390 1
## 41 1 0.76918380 0.23081620 1
## 42 1 0.75172238 0.24827762 1
## 43 1 0.79917020 0.20082980 1
## 44 1 0.88517266 0.11482734 1
## 45 2 0.17418392 0.82581608 2
## 46 1 0.84489194 0.15510806 1
## 47 1 0.55950838 0.44049162 1
## 48 1 0.95765517 0.04234483 1
## 49 2 0.33289662 0.66710338 2
## 50 2 0.29193571 0.70806429 2
## 51 1 0.74044540 0.25955460 1
## 52 1 0.42447065 0.57552935 2
## 53 1 0.87850362 0.12149638 1
## 54 1 0.86321446 0.13678554 1
## 55 1 0.44030269 0.55969731 2
## 56 2 0.46875616 0.53124384 2
## 57 1 0.94066313 0.05933687 1
## 58 1 0.86387990 0.13612010 1
## 59 1 0.83393174 0.16606826 1
## 60 1 0.88280893 0.11719107 1
## 61 1 0.93752696 0.06247304 1
## 62 1 0.89304743 0.10695257 1
## 63 2 0.29810034 0.70189966 2
## 64 1 0.85679078 0.14320922 1
## 65 2 0.49296422 0.50703578 2
## 66 1 0.76090245 0.23909755 1
## 67 2 0.58718727 0.41281273 1
## 68 1 0.81579296 0.18420704 1
## 69 1 0.88149561 0.11850439 1
## 70 1 0.78109662 0.21890338 1
## 71 1 0.87559865 0.12440135 1
## 72 1 0.83534798 0.16465202 1
## 73 1 0.70094347 0.29905653 1
## 74 1 0.93845853 0.06154147 1
## 75 2 0.39393461 0.60606539 2
## 76 1 0.77393195 0.22606805 1
## 77 2 0.56496819 0.43503181 1
## 78 1 0.64280600 0.35719400 1
## 79 1 0.86487183 0.13512817 1
## 80 1 0.82146535 0.17853465 1
## 81 2 0.23398233 0.76601767 2
## 82 1 0.70897289 0.29102711 1
## 83 1 0.90426470 0.09573530 1
## 84 1 0.69009484 0.30990516 1
## 85 2 0.42362262 0.57637738 2
## 86 1 0.84429444 0.15570556 1
## 87 1 0.72654995 0.27345005 1
## 88 1 0.71636171 0.28363829 1
## 89 1 0.81752170 0.18247830 1
## 90 2 0.34730122 0.65269878 2
## 91 2 0.26576042 0.73423958 2
## 92 2 0.31485304 0.68514696 2
## 93 2 0.28321468 0.71678532 2
## 94 1 0.47486420 0.52513580 2
## 95 1 0.77925697 0.22074303 1
## 96 1 0.44735913 0.55264087 2
## 97 1 0.92002788 0.07997212 1
## 98 1 0.51885586 0.48114414 1
## 99 2 0.38727675 0.61272325 2
## 100 1 0.47038133 0.52961867 2
## 101 1 0.75834099 0.24165901 1
## 102 1 0.82361396 0.17638604 1
## 103 1 0.52383583 0.47616417 1
## 104 1 0.71247243 0.28752757 1
## 105 2 0.27633649 0.72366351 2
## 106 2 0.31529910 0.68470090 2
## 107 1 0.86308501 0.13691499 1
## 108 2 0.34363523 0.65636477 2
## 109 1 0.63188820 0.36811180 1
## 110 2 0.21805036 0.78194964 2
## 111 1 0.86935189 0.13064811 1
## 112 1 0.57236525 0.42763475 1
## 113 1 0.54188346 0.45811654 1
## 114 1 0.74872076 0.25127924 1
## 115 1 0.93718889 0.06281111 1
## 116 1 0.90205014 0.09794986 1
## 117 1 0.82279648 0.17720352 1
## 118 1 0.71355729 0.28644271 1
## 119 1 0.89257065 0.10742935 1
## 120 2 0.29803792 0.70196208 2
## 121 2 0.49948245 0.50051755 2
## 122 1 0.86411343 0.13588657 1
## 123 1 0.92427702 0.07572298 1
## 124 1 0.68940179 0.31059821 1
## 125 1 0.78588045 0.21411955 1
## 126 1 0.70703190 0.29296810 1
## 127 1 0.78272970 0.21727030 1
## 128 1 0.83921883 0.16078117 1
## 129 1 0.62152915 0.37847085 1
## 130 2 0.50944058 0.49055942 1
## 131 1 0.89182988 0.10817012 1
## 132 1 0.94343071 0.05656929 1
## 133 1 0.78643067 0.21356933 1
## 134 1 0.90624065 0.09375935 1
## 135 1 0.84101654 0.15898346 1
## 136 1 0.96519064 0.03480936 1
## 137 1 0.97917164 0.02082836 1
## 138 1 0.90456314 0.09543686 1
## 139 1 0.87054760 0.12945240 1
## 140 1 0.68068381 0.31931619 1
## 141 1 0.98303948 0.01696052 1
## 142 1 0.84364465 0.15635535 1
## 143 1 0.94026440 0.05973560 1
## 144 1 0.70068669 0.29931331 1
## 145 1 0.87501129 0.12498871 1
## 146 1 0.76365204 0.23634796 1
## 147 1 0.27369253 0.72630747 2
## 148 1 0.50973397 0.49026603 1
## 149 1 0.88042793 0.11957207 1
## 150 2 0.60788478 0.39211522 1
## 151 1 0.79657147 0.20342853 1
## 152 1 0.86042181 0.13957819 1
## 153 2 0.17561941 0.82438059 2
## 154 1 0.74364689 0.25635311 1
## 155 1 0.83112958 0.16887042 1
## 156 1 0.87796524 0.12203476 1
## 157 1 0.68095381 0.31904619 1
## 158 1 0.74841007 0.25158993 1
## 159 1 0.53205151 0.46794849 1
## 160 1 0.80065240 0.19934760 1
## 161 1 0.74748940 0.25251060 1
## 162 1 0.71490212 0.28509788 1
## 163 2 0.40706453 0.59293547 2
## 164 1 0.66357758 0.33642242 1
## 165 1 0.84069592 0.15930408 1
## 166 1 0.90614964 0.09385036 1
## 167 1 0.88653756 0.11346244 1
## 168 1 0.80676697 0.19323303 1
## 169 1 0.78301156 0.21698844 1
## 170 2 0.13202779 0.86797221 2
## 171 2 0.29566826 0.70433174 2
## 172 1 0.41959602 0.58040398 2
## 173 2 0.31651790 0.68348210 2
## 174 2 0.48356159 0.51643841 2
## 175 1 0.94401337 0.05598663 1
## 176 2 0.45602587 0.54397413 2
## 177 1 0.46200831 0.53799169 2
## 178 1 0.39196416 0.60803584 2
## 179 2 0.49726587 0.50273413 2
## 180 1 0.64529322 0.35470678 1
## 181 2 0.24682300 0.75317700 2
## 182 1 0.72048090 0.27951910 1
## 183 1 0.73982570 0.26017430 1
## 184 1 0.72578471 0.27421529 1
## 185 1 0.83967392 0.16032608 1
## 186 2 0.09727077 0.90272923 2
## 187 2 0.23006488 0.76993512 2
## 188 2 0.15427183 0.84572817 2
## 189 2 0.01696052 0.98303948 2
## 190 1 0.72165895 0.27834105 1
## 191 2 0.17419515 0.82580485 2
## 192 2 0.17786808 0.82213192 2
## 193 1 0.51028818 0.48971182 1
## 194 2 0.37741936 0.62258064 2
## 195 1 0.53651474 0.46348526 1
## 196 1 0.47165875 0.52834125 2
## 197 1 0.70490064 0.29509936 1
## 198 1 0.50949007 0.49050993 1
## 199 1 0.76133652 0.23866348 1
## 200 1 0.71761376 0.28238624 1
## 201 1 0.97799229 0.02200771 1
## 202 1 0.64610033 0.35389967 1
## 203 1 0.81390759 0.18609241 1
## 204 1 0.67015518 0.32984482 1
## 205 1 0.92418157 0.07581843 1
## 206 1 0.80742336 0.19257664 1
## 207 1 0.77262900 0.22737100 1
## 208 1 0.90194271 0.09805729 1
## 209 1 0.96405239 0.03594761 1
## 210 1 0.65732347 0.34267653 1
## 211 2 0.54113726 0.45886274 1
## 212 1 0.88222737 0.11777263 1
## 213 1 0.70548357 0.29451643 1
## 214 2 0.40498594 0.59501406 2
## 215 1 0.84026703 0.15973297 1
## 216 2 0.09364869 0.90635131 2
## 217 1 0.84149769 0.15850231 1
## 218 1 0.92126921 0.07873079 1
## 219 2 0.54061876 0.45938124 1
## 220 1 0.71925762 0.28074238 1
## 221 1 0.91641207 0.08358793 1
## 222 1 0.86912957 0.13087043 1
## 223 1 0.90161560 0.09838440 1
## 224 1 0.97852177 0.02147823 1
## 225 1 0.70785463 0.29214537 1
## 226 1 0.70660916 0.29339084 1
## 227 1 0.90546969 0.09453031 1
## 228 1 0.94158507 0.05841493 1
## 229 1 0.83590433 0.16409567 1
## 230 2 0.39777233 0.60222767 2
## 231 1 0.88805921 0.11194079 1
## 232 1 1.00000000 0.00000000 1
## 233 2 0.38615503 0.61384497 2
## 234 2 0.43601802 0.56398198 2
## 235 1 0.72189106 0.27810894 1
## 236 2 0.64811031 0.35188969 1
## 237 2 0.40748631 0.59251369 2
## 238 2 0.57294670 0.42705330 1
## 239 1 0.84490652 0.15509348 1
## 240 1 0.49678086 0.50321914 2
## 241 1 0.74871891 0.25128109 1
## 242 1 0.72976362 0.27023638 1
## 243 1 0.81659507 0.18340493 1
## 244 2 0.31099522 0.68900478 2
## 245 1 0.87725456 0.12274544 1
## 246 1 0.72164706 0.27835294 1
## 247 1 0.90510091 0.09489909 1
## 248 2 0.55846964 0.44153036 1
## 249 2 0.39852599 0.60147401 2
## 250 1 0.69617343 0.30382657 1
## 251 1 0.76532590 0.23467410 1
## 252 1 0.70305767 0.29694233 1
## 253 2 0.13920397 0.86079603 2
## 254 2 0.46293283 0.53706717 2
## 255 1 0.87668590 0.12331410 1
## 256 2 0.47389257 0.52610743 2
## 257 1 0.42504843 0.57495157 2
## 258 1 0.60112348 0.39887652 1
## 259 1 0.90088705 0.09911295 1
## 260 1 0.87822934 0.12177066 1
## 261 1 0.76531207 0.23468793 1
## 262 1 0.86047413 0.13952587 1
## 263 2 0.55164982 0.44835018 1
## 264 1 0.58014069 0.41985931 1
## 265 1 0.67719393 0.32280607 1
## 266 1 0.92503317 0.07496683 1
## 267 1 0.80566775 0.19433225 1
## 268 2 0.43496565 0.56503435 2
## 269 2 0.33326066 0.66673934 2
## 270 1 0.79867035 0.20132965 1
## 271 1 0.67811675 0.32188325 1
## 272 2 0.22354111 0.77645889 2
## 273 1 0.82372081 0.17627919 1
## 274 2 0.22086552 0.77913448 2
## 275 2 0.28833566 0.71166434 2
## 276 1 0.66028418 0.33971582 1
## 277 1 0.82057562 0.17942438 1
## 278 1 0.56606633 0.43393367 1
## 279 2 0.43850708 0.56149292 2
## 280 2 0.34375869 0.65624131 2
## 281 2 0.44940465 0.55059535 2
## 282 1 0.60038364 0.39961636 1
## 283 2 0.28083994 0.71916006 2
## 284 2 0.31348397 0.68651603 2
## 285 2 0.56170112 0.43829888 1
## 286 2 0.46961605 0.53038395 2
## 287 2 0.39558286 0.60441714 2
## 288 2 0.13942190 0.86057810 2
## 289 2 0.32028520 0.67971480 2
## 290 2 0.49348296 0.50651704 2
## 291 2 0.26125598 0.73874402 2
## 292 2 0.51951119 0.48048881 1
## 293 2 0.42594748 0.57405252 2
## 294 1 0.33731002 0.66268998 2
## 295 1 0.47164062 0.52835938 2
## 296 2 0.21735253 0.78264747 2
## 297 2 0.19592720 0.80407280 2
## 298 2 0.35786931 0.64213069 2
## 299 1 0.51146558 0.48853442 1
## 300 2 0.00000000 1.00000000 2
8.9 Random Forest
# Reload the partitions; randomForest requires a factor response for classification.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
set.seed(1234)
# Drop collinear variables (columns 4-5) — removing them improved the results
clasificadorRF <- randomForest(cluster ~ ., data = my_train1[,-4:-5], ntree = 250)
pred_valid_RF <- predict(clasificadorRF, newdata = my_test1[,-4:-5])
# Long-format confusion table: rows (Obj=1,Pred=1), (2,1), (1,2), (2,2)
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_RF))
cfm
## Obj Pred Freq
## 1 1 1 208
## 2 2 1 19
## 3 1 2 9
## 4 2 2 64
# Misclassification count = off-diagonal cells (rows 2 and 3)
errclasRF <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasRF ))
## [1] "Errores de clasificación: 28"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predRF = mean(pred_valid_RF == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRF ))
## [1] "Porcentaje de acierto/Accuracy: 0.906666666666667"
# ROCR prediction object from the numeric-coded labels
pred1 <- prediction(as.numeric(pred_valid_RF), as.numeric(my_test1$cluster))
# ROC curve: 'False positive rate' vs. 'True positive rate' -> closer to top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> better the closer to 1, and should be at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.864804841485759"
# NOTE(review): precision/recall formulas were swapped in the original (precision
# summed over observed class Obj instead of predicted class Pred). Fixed; the
# printed values swap accordingly and F1 is unchanged (it is symmetric).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.916299559471366"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.876712328767123"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVOS
# Recall group 1 = TP / (TP + FN)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.95852534562212"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.771084337349398"
# F1 score group 1 (unchanged by the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.936936936936937"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.82051282051282"
# Collect the random-forest metrics into a one-row summary and append it
# to the running results table.
z6 <- data.frame(
  modelo = "randomForest",
  accuracy = predRF,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasRF
)
RESULTADOS <- rbind(RESULTADOS, z6)
8.10 Ranger
# Reload the partitions for the ranger section.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# RANGER MODEL
# Hyperparameter grid. NOTE(review): "sampe_size" is a typo for "sample_size";
# kept as-is because the column name appears in the recorded output below.
# NOTE(review): OOB_RMSE is a misnomer — for classification, prediction.error
# is a misclassification rate, so sqrt() of it is not an RMSE.
hyper_grid <- expand.grid(
n.trees = c(100,150,250,500,750,2000),
node_size = seq(3, 9, by = 2),
sampe_size = c(.55, .632, .70, .80),
OOB_RMSE = 0
)
# Total number of combinations
nrow(hyper_grid)
## [1] 96
set.seed(1234)
# Fit one ranger forest per grid row (1:nrow is safe here: the grid is non-empty)
for(i in 1:nrow(hyper_grid)) {
# Model
model <- ranger(
formula = cluster ~ .,
data = my_train1,
num.trees = hyper_grid$n.trees[i],
min.node.size = hyper_grid$node_size[i],
sample.fraction = hyper_grid$sampe_size[i],
seed = 1234,
write.forest = TRUE,
splitrule = "gini",
verbose = TRUE,
classification = TRUE,
keep.inbag = TRUE
)
# Record the out-of-bag error for this combination
hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}
# Best parameter combinations (lowest OOB error first)
hyper_grid %>%
dplyr::arrange(OOB_RMSE) %>%
head(10)
## n.trees node_size sampe_size OOB_RMSE
## 1 750 3 0.800 0.3295018
## 2 750 7 0.800 0.3338092
## 3 2000 3 0.632 0.3359422
## 4 2000 7 0.632 0.3359422
## 5 750 3 0.632 0.3380617
## 6 100 9 0.632 0.3380617
## 7 100 3 0.632 0.3401680
## 8 100 7 0.632 0.3401680
## 9 250 9 0.632 0.3401680
## 10 750 5 0.800 0.3401680
# Refit with the best parameters from the grid above (lowest OOB error):
# 750 trees, min.node.size = 3, sample.fraction = 0.8.
set.seed(1234)
fit <- ranger(
cluster ~. ,
data = my_train1,
num.trees = 750,
importance = 'impurity',
write.forest = TRUE,
min.node.size = 3,
sample.fraction = .8,
splitrule = "gini",
verbose = TRUE,
classification = TRUE,
keep.inbag = TRUE
)
fit
## Ranger result
##
## Call:
## ranger(cluster ~ ., data = my_train1, num.trees = 750, importance = "impurity", write.forest = TRUE, min.node.size = 3, sample.fraction = 0.8, splitrule = "gini", verbose = TRUE, classification = TRUE, keep.inbag = TRUE)
##
## Type: Classification
## Number of trees: 750
## Sample size: 700
## Number of independent variables: 31
## Mtry: 5
## Target node size: 3
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 11.43 %
# Extract impurity-based variable importance and sort descending
vars_imp <- fit$variable.importance
vars_imp <- as.data.frame(vars_imp)
vars_imp$myvar <- rownames(vars_imp)
vars_imp <- as.data.table (vars_imp)
setorder(vars_imp, -vars_imp)
# Plot variable importance with ggpubr
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ggimage':
##
## theme_transparent
## The following object is masked from 'package:cvms':
##
## font
## The following object is masked from 'package:plyr':
##
## mutate
# Horizontal bar chart of the 10 most important variables
ggbarplot(vars_imp[1:10],
x = "myvar", y = "vars_imp",
#fill = 'myvar',
color = "blue", # bar border color
palette = "jco", # jco journal color palette. see ?ggpar
sort.val = "asc", # sort values in ascending order
sort.by.groups = FALSE, # don't sort inside each group
x.text.angle = 90, # rotate x axis texts vertically
ylab = "Importancia",
xlab = 'Variable',
#legend.title = "MPG Group",
rotate = TRUE,
ggtheme = theme_minimal()
)

# Evaluate the tuned model on the hold-out set
valor_pred <- predict(fit, data = my_test1)
pred_valid_RANGER <- valor_pred$predictions
# Long-format confusion table: rows (Obj=1,Pred=1), (2,1), (1,2), (2,2)
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_RANGER))
cfm
## Obj Pred Freq
## 1 1 1 206
## 2 2 1 20
## 3 1 2 11
## 4 2 2 63
# Misclassification count = off-diagonal cells (rows 2 and 3)
errclasRANGER <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasRANGER ))
## [1] "Errores de clasificación: 31"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predRANGER = mean(pred_valid_RANGER == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRANGER ))
## [1] "Porcentaje de acierto/Accuracy: 0.896666666666667"
# ROCR prediction object from the numeric-coded labels
pred1 <- prediction(as.numeric(pred_valid_RANGER), as.numeric(my_test1$cluster))
# ROC curve: 'False positive rate' vs. 'True positive rate' -> closer to top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> better the closer to 1, and should be at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.854172450169341"
# NOTE(review): precision/recall formulas were swapped in the original (precision
# summed over observed class Obj instead of predicted class Pred). Fixed; the
# printed values swap accordingly and F1 is unchanged (it is symmetric).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.911504424778761"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.851351351351351"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVES
# Recall group 1 = TP / (TP + FN)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.949308755760369"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.759036144578313"
# F1 score group 1 (unchanged by the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.930022573363431"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.802547770700637"
# Collect the ranger metrics into a one-row summary and append it to the
# running results table.
z7 <- data.frame(
  modelo = "ranger",
  accuracy = predRANGER,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasRANGER
)
RESULTADOS <- rbind(RESULTADOS, z7)
8.11 Extreme Gradient Boosting
# Reload the partitions for the XGBoost section.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
# Hyperparameter tuning setup: 3-fold CV (repeated once) via caret
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 1,number = 3,
#summaryFunction = twoClassSummary,
classProbs = FALSE,
allowParallel=T)
# Full tuning grid for xgbTree
xgb.grid <- expand.grid(nrounds = 1000,
max_depth = c(2,4,6,10),
eta = c(0.05,0.1,0.2,0.5,1),
gamma = c(0.1, 0.3),
colsample_bytree = c(0.3, 0.5 , 0.7 ),
min_child_weight = c(1, 3,5,7),
subsample = c(0.25, 0.5,0.75,1))
set.seed(1234)
# The tuning run takes over an hour, so it is commented out and the
# previously fitted model is downloaded instead.
#xgb_tune1 <- train(as.factor(cluster) ~., data=my_train1, method="xgbTree", trControl=cv.ctrl, tuneGrid=xgb.grid, verbose=T, metric="Kappa", nthread =3)
#saveRDS(xgb_tune1, file = "xgb_tune1.rds")
xgb_tune1 <- readRDS(url('https://github.com/Juanmick/TFM/blob/master/xgb_tune1.rds?raw=true'))
# Best tuning parameter
xgb_tune1$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 770 1000 2 0.2 0.1 0.3 1 0.5
set.seed(1234)
# Make predictions on the test data
pred_valid_XGB <- xgb_tune1 %>% predict(my_test1)
# Long-format confusion table: rows (Obj=1,Pred=1), (2,1), (1,2), (2,2)
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_XGB))
cfm
## Obj Pred Freq
## 1 1 1 204
## 2 2 1 8
## 3 1 2 13
## 4 2 2 75
# Misclassification count = off-diagonal cells (rows 2 and 3)
errclasXGB <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasXGB ))
## [1] "Errores de clasificación: 21"
# Confusion-matrix plot
plot_confusion_matrix(cfm,
targets_col = "Obj",
predictions_col = "Pred",
counts_col = "Freq")

# Accuracy on the hold-out set
predXGB = mean(pred_valid_XGB == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predXGB ))
## [1] "Porcentaje de acierto/Accuracy: 0.93"
# ROCR prediction object from the numeric-coded labels
pred1 <- prediction(as.numeric(pred_valid_XGB), as.numeric(my_test1$cluster))
# ROC curve: 'False positive rate' vs. 'True positive rate' -> closer to top-left is better
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> better the closer to 1, and should be at least above 0.7
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.921853311864972"
# NOTE(review): precision/recall formulas were swapped in the original (precision
# summed over observed class Obj instead of predicted class Pred). Fixed; the
# printed values swap accordingly and F1 is unchanged (it is symmetric).
# What proportion of the rows classified as group X truly belong to it? HIGH = FEW FALSE POSITIVES
# Precision group 1 = TP / (TP + FP)
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.962264150943396"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.852272727272727"
# What proportion of the actual members of group X are classified as such? HIGH = FEW FALSE NEGATIVES
# Recall group 1 = TP / (TP + FN)
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.940092165898618"
# Recall group 2
rec2 = cfm[4,3]/(cfm[2,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.903614457831325"
# F1 score group 1 (unchanged by the swap fix)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.951048951048951"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.87719298245614"
# Collect the XGBoost metrics into a one-row summary and append it to the
# running results table.
z8 <- data.frame(
  modelo = "xgb",
  accuracy = predXGB,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasXGB
)
RESULTADOS <- rbind(RESULTADOS, z8)
# Scaled variable importance of the tuned xgbTree model (caret)
varImp(xgb_tune1)
## xgbTree variable importance
##
## only 20 most important variables shown (out of 31)
##
## Overall
## daynum 100.000
## Tax5 72.798
## fe_payment 63.885
## tmed 59.014
## fe_customer 42.715
## UnitPrice 38.791
## fe_gender 36.604
## GenderMale 35.302
## day 35.144
## Rating 32.096
## week 31.460
## longitude 27.619
## Total 25.814
## cogs 24.276
## fe_city 20.136
## CustomerTypeNormal 18.598
## PaymentEwallet 17.959
## fe_lonlat 17.905
## hour 13.280
## latitude 8.838
8.12 Gradient Boosting Machine
# Reload the partitions; gbm via caret needs a factor response.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)
set.seed(1234)
# Repeated k-fold cross-validation to validate the models
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated ten times
repeats = 10)
set.seed(1234)
# Stochastic gradient boosting with caret's default gbm tuning grid
gbmFit1 <- train(cluster~ ., data = my_train1, method = "gbm", trControl = fitControl, verbose = FALSE)
## This last option is actually one
## for gbm() that passes through
gbmFit1
## Stochastic Gradient Boosting
##
## 700 samples
## 31 predictor
## 2 classes: '1', '2'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 631, 630, 630, 629, 629, 630, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8724511 0.6989401
## 1 100 0.9207286 0.8186554
## 1 150 0.9356013 0.8546552
## 2 50 0.8990269 0.7666810
## 2 100 0.9313213 0.8454439
## 2 150 0.9350055 0.8545164
## 3 50 0.9024722 0.7766249
## 3 100 0.9253290 0.8315406
## 3 150 0.9327419 0.8494871
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
# NOTE(review): the printed summary above selects n.trees = 150 and
# interaction.depth = 1 (not 2) as the final model — original comment corrected.
trellis.par.set(caretTheme())
# Accuracy profile over the tuning grid (n.trees x interaction.depth)
plot(gbmFit1)

# Final model per the summary above: n.trees = 150, interaction.depth = 1,
# shrinkage = 0.1 and n.minobsinnode = 10.
trellis.par.set(caretTheme())
# Level plot of Kappa over the tuning grid; x-axis labels rotated 90 degrees
plot(gbmFit1, metric = "Kappa", plotType = "level",
scales = list(x = list(rot = 90)))

# Predict the cluster of every test observation with the fitted GBM
set.seed(1234)
pred_valid_GBM <- predict(gbmFit1, my_test1)

# Long-format confusion table: observed (Obj) vs predicted (Pred) counts
conf_counts <- table(Obj = my_test1$cluster, Pred = pred_valid_GBM)
cfm <- as.data.frame(conf_counts)
cfm
## Obj Pred Freq
## 1 1 1 211
## 2 2 1 13
## 3 1 2 6
## 4 2 2 70
# Misclassifications = off-diagonal counts (rows 2 and 3 of the long table)
errclasGBM <- cfm[2, 3] + cfm[3, 3]
print(paste0("Errores de clasificación: ", errclasGBM))
## [1] "Errores de clasificación: 19"
# Confusion matrix heatmap from the long-format table built above
plot_confusion_matrix(cfm,
                      targets_col = "Obj",
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Overall accuracy: proportion of test rows whose predicted cluster matches
# the observed one. (Fixed: use `<-` for assignment instead of `=`.)
predGBM <- mean(pred_valid_GBM == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ", predGBM))
## [1] "Porcentaje de acierto/Accuracy: 0.936666666666667"
# ROCR expects numeric scores; here the hard class labels are used as the
# "scores", so the ROC curve has a single operating point.
# NOTE(review): for a proper ROC/AUC use class probabilities instead, e.g.
# predict(gbmFit1, my_test1, type = "prob")[, 2] — that would change the
# reported figures below, so the label-based version is kept.
pred1 <- prediction(as.numeric(pred_valid_GBM), as.numeric(my_test1$cluster))
# ROC curve: false positive rate vs true positive rate — the closer the curve
# hugs the top-left corner, the better. (Fixed: dropped stray semicolons.)
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# Area under the ROC curve: closer to 1 is better; should be at least 0.7
auc.tmp <- performance(pred1, "auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ", auc))
## [1] "Valor AUC: 0.907861862195325"
# cfm long-format rows: 1 = (Obj1,Pred1)  2 = (Obj2,Pred1)
#                       3 = (Obj1,Pred2)  4 = (Obj2,Pred2)
#
# BUGFIX: precision and recall were swapped in the original. Precision for
# group X is TP / (TP + FP) — the denominator is everything *predicted* as X,
# not everything that truly *is* X (that denominator gives recall). The F1
# scores are unchanged because the harmonic mean is symmetric in the swap.
#
# Precision: of the rows classified into group X, how many truly are X?
# High precision = few false positives.
# Precision group 1: 211 / (211 + 13)
prec1 <- cfm[1, 3] / (cfm[1, 3] + cfm[2, 3])
print(paste0("Precision Grupo 1: ", prec1))
## [1] "Precision Grupo 1: 0.941964285714286"
# Precision group 2: 70 / (70 + 6)
prec2 <- cfm[4, 3] / (cfm[4, 3] + cfm[3, 3])
print(paste0("Precision Grupo 2: ", prec2))
## [1] "Precision Grupo 2: 0.921052631578947"
# Recall: of the rows that truly are group X, how many were classified as X?
# High recall = few false negatives.
# Recall group 1: 211 / (211 + 6)
rec1 <- cfm[1, 3] / (cfm[1, 3] + cfm[3, 3])
print(paste0("Recall Grupo 1: ", rec1))
## [1] "Recall Grupo 1: 0.972350230414747"
# Recall group 2: 70 / (13 + 70)
rec2 <- cfm[4, 3] / (cfm[2, 3] + cfm[4, 3])
print(paste0("Recall Grupo 2: ", rec2))
## [1] "Recall Grupo 2: 0.843373493975904"
# F1 score group 1 (harmonic mean of precision and recall)
f1gr1 <- 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ", f1gr1))
## [1] "F1 score Grupo 1: 0.956916099773243"
# F1 score group 2 (typo fixed: was labelled "F2 SCORE")
f1gr2 <- 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ", f1gr2))
## [1] "F1 score Grupo 2: 0.880503144654088"
# Collect this model's metrics as one row and append it to the comparison table
z9 <- data.frame(
  modelo = "gbm",
  accuracy = predGBM,
  AUC = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1 = rec1,
  recall2 = rec2,
  f1grupo1 = f1gr1,
  f1grupo2 = f1gr2,
  errorClas = errclasGBM
)
RESULTADOS <- rbind(RESULTADOS, z9)